From dae57f9d835c954aabd8c5240805c94edf5ae107 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Fri, 14 Nov 2025 08:00:27 -0800 Subject: [PATCH 01/20] adding tests --- .../flow_matching/flow_inference_pipeline.py | 8 +- .../data/wan/test_wan_energon_datamodule.py | 57 ++++++++ .../data/wan/test_wan_mock_datamodule.py | 54 +++++++ .../megatron/data/wan/test_wan_taskencoder.py | 135 ++++++++++++++++++ .../test_flow_inference_pipeline.py | 71 +++++++++ .../wan/flow_matching/test_flow_pipeline.py | 104 ++++++++++++++ .../flow_matching/test_time_shift_utils.py | 54 +++++++ .../wan/inference/test_inference_init.py | 29 ++++ .../wan/inference/test_inference_utils.py | 65 +++++++++ .../megatron/model/wan/test_rope_utils.py | 38 +++++ .../megatron/model/wan/test_utils.py | 36 +++++ .../megatron/model/wan/test_wan_layer_spec.py | 14 ++ .../megatron/model/wan/test_wan_model_misc.py | 13 ++ .../megatron/model/wan/test_wan_provider.py | 55 +++++++ .../megatron/model/wan/test_wan_step.py | 50 +++++++ 15 files changed, 779 insertions(+), 4 deletions(-) create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py create mode 100644 tests/unit_tests/megatron/model/wan/inference/test_inference_init.py create mode 100644 tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py create mode 100644 tests/unit_tests/megatron/model/wan/test_rope_utils.py create mode 100644 tests/unit_tests/megatron/model/wan/test_utils.py create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_model_misc.py create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_provider.py create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_step.py diff --git a/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py b/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py index 2bbb0eb3..459876b2 100644 --- a/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py +++ b/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py @@ -229,13 +229,13 @@ def forward_pp_step( """ pp_world_size = parallel_state.get_pipeline_model_parallel_world_size() - is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True) - is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True) - - # PP=1: no pipeline parallelism + # PP=1: no pipeline parallelism (avoid touching PP groups which may be uninitialized in unit tests) if pp_world_size == 1: noise_pred_pp = self.model(latent_model_input, grid_sizes=grid_sizes, t=timestep, **arg_c) return noise_pred_pp + # For PP>1, safe to query stage information + is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True) + is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True) # PP>1: pipeline parallelism hidden_size = self.model.config.hidden_size diff --git a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py new file mode 100644 index 00000000..6fb27798 --- /dev/null +++ b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py @@ -0,0 +1,57 @@ +import types + +from dfm.src.megatron.data.wan import wan_energon_datamodule as wan_dm_mod +from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder + + +class _FakeDiffusionDataModule: + def __init__( + self, + *, + path: str, + seq_length: int, + packing_buffer_size: int, + task_encoder, + micro_batch_size: int, + global_batch_size: int, + num_workers: int, + ): + self.path = path + self.seq_length = seq_length + self.packing_buffer_size = packing_buffer_size + self.task_encoder = task_encoder + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.num_workers = num_workers + + # mimic API used by WanDataModuleConfig.build_datasets + def train_dataloader(self): + return "train" + + +def test_wan_datamodule_config_initialization(monkeypatch): + # Patch the symbol used inside wan_energon_datamodule module + monkeypatch.setattr(wan_dm_mod, "DiffusionDataModule", _FakeDiffusionDataModule) + + cfg = wan_dm_mod.WanDataModuleConfig( + path="", + seq_length=128, + task_encoder_seq_length=128, + packing_buffer_size=4, + micro_batch_size=2, + global_batch_size=8, + num_workers=0, + ) + + # __post_init__ should construct a dataset with WanTaskEncoder and propagate seq_length + assert isinstance(cfg.dataset, _FakeDiffusionDataModule) + assert cfg.sequence_length == cfg.dataset.seq_length == 128 + assert isinstance(cfg.dataset.task_encoder, WanTaskEncoder) + assert cfg.dataset.task_encoder.seq_length == 128 + assert cfg.dataset.task_encoder.packing_buffer_size == 4 + + # build_datasets should return train loader thrice + train, val, test = cfg.build_datasets(context=None) + assert train == "train" and val == "train" and test == "train" + + diff --git a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py new file mode 100644 index 00000000..d8f10d74 --- /dev/null +++ b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py @@ -0,0 +1,54 @@ +import torch + +from torch.utils.data import DataLoader + +from dfm.src.megatron.data.wan.wan_mock_datamodule import WanMockDataModuleConfig + + +def test_wan_mock_datamodule_build_and_batch_shapes(): + cfg = WanMockDataModuleConfig( + path="", + seq_length=128, + packing_buffer_size=2, + micro_batch_size=2, + global_batch_size=8, + num_workers=0, + # Use small shapes for a light-weight test run + F_latents=4, + H_latents=8, + W_latents=6, + patch_spatial=2, + patch_temporal=1, + number_packed_samples=2, + context_seq_len=16, + context_embeddings_dim=64, + ) + train_dl, val_dl, test_dl = cfg.build_datasets(_context=None) + assert isinstance(train_dl, DataLoader) + assert train_dl is val_dl and val_dl is test_dl + + batch = next(iter(train_dl)) + expected_keys = { + "video_latents", + "context_embeddings", + "loss_mask", + "seq_len_q", + "seq_len_q_padded", + "seq_len_kv", + "seq_len_kv_padded", + "grid_sizes", + "video_metadata", + } + assert expected_keys.issubset(set(batch.keys())) + + # Basic sanity checks on shapes/dtypes + assert batch["video_latents"].dim() == 3 and batch["video_latents"].shape[1] == 1 + assert batch["context_embeddings"].dim() == 3 and batch["context_embeddings"].shape[1] == 1 + assert batch["loss_mask"].dim() == 2 and batch["loss_mask"].shape[1] == 1 + assert batch["seq_len_q"].dtype == torch.int32 + assert batch["seq_len_q_padded"].dtype == torch.int32 + assert batch["seq_len_kv"].dtype == torch.int32 + assert batch["seq_len_kv_padded"].dtype == torch.int32 + assert batch["grid_sizes"].dtype == torch.int32 + + diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py new file mode 100644 index 00000000..2de5173d --- /dev/null +++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py @@ -0,0 +1,135 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch + +from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder, cook, parallel_state + + +def test_cook_extracts_expected_fields(): + sample = { + "__key__": "k", + "__restore_key__": "rk", + "__subflavors__": [], + "json": {"meta": 1}, + "pth": torch.randn(1, 2, 2, 2), + "pickle": torch.randn(3, 4), + "unused": 123, + } + out = cook(sample) + assert "json" in out and out["json"] is sample["json"] + assert "pth" in out and torch.equal(out["pth"], sample["pth"]) + assert "pickle" in out and torch.equal(out["pickle"], sample["pickle"]) + # ensure basic keys from the sample are preserved by cook via basic_sample_keys() + assert out["__key__"] == sample["__key__"] + assert out["__restore_key__"] == sample["__restore_key__"] + assert out["__subflavors__"] == sample["__subflavors__"] + + +def test_encode_sample_no_context_parallel(monkeypatch): + # Ensure CP world size is 1 to avoid extra padding branch + monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) + # Ensure seeded wrapper has an active worker config + from megatron.energon.task_encoder.base import WorkerConfig + class _FakeWorkerCfg: + def worker_seed(self): return 123 + active_worker_sample_index = 0 + monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False) + + # Construct a minimal, consistent sample + c = 8 + F_latents, H_latents, W_latents = 4, 8, 6 + patch_temporal, patch_spatial = 1, 2 + # video latent before patchify has shape [c, F_latents, H_latents, W_latents] + # where grid sizes (patch counts) are (F_latents // pF, H_latents // pH, W_latents // pW) + video_latent = torch.randn(c, F_latents, H_latents, W_latents) + context_len, context_dim = 256, 64 + context_embeddings = torch.randn(context_len, context_dim) + sample = { + "__key__": "k", + "__restore_key__": "rk", + "__subflavors__": [], + "json": {"meta": 1}, + "pth": video_latent, + "pickle": context_embeddings, + } + + enc = WanTaskEncoder(seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None) + out = enc.encode_sample(sample) + + # Grid / patches + F_patches = F_latents // patch_temporal + H_patches = H_latents // patch_spatial + W_patches = W_latents // patch_spatial + num_patches = F_patches * H_patches * W_patches + patch_vec_dim = c * patch_temporal * patch_spatial * patch_spatial + + assert out.video.shape == (num_patches, patch_vec_dim) + assert out.latent_shape.dtype == torch.int32 + assert torch.equal(out.latent_shape, torch.tensor([F_patches, H_patches, W_patches], dtype=torch.int32)) + + # Loss mask and seq lengths + assert out.loss_mask.dtype == torch.bfloat16 + assert out.loss_mask.shape[0] == num_patches + assert torch.equal(out.seq_len_q, torch.tensor([num_patches], dtype=torch.int32)) + # context embeddings are padded to fixed 512 inside encode_sample + assert torch.equal(out.seq_len_kv, torch.tensor([512], dtype=torch.int32)) + assert torch.equal(out.seq_len_q_padded, out.seq_len_q) + assert torch.equal(out.seq_len_kv_padded, out.seq_len_kv) + + # Metadata passthrough + assert out.video_metadata == sample["json"] + assert out.__key__ == sample["__key__"] + assert out.__restore_key__ == sample["__restore_key__"] + assert out.__subflavors__ == sample["__subflavors__"] + + +def test_batch_with_packing_buffer_size(monkeypatch): + # Force CP world size 1 + monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) + # Ensure seeded wrapper has an active worker config + from megatron.energon.task_encoder.base import WorkerConfig + class _FakeWorkerCfg: + def worker_seed(self): return 456 + active_worker_sample_index = 0 + monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False) + + c = 4 + F_latents, H_latents, W_latents = 2, 4, 4 + patch_temporal, patch_spatial = 1, 2 + video_latent = torch.randn(c, F_latents * patch_temporal, H_latents * patch_spatial, W_latents * patch_spatial) + sample = { + "__key__": "k", + "__restore_key__": "rk", + "__subflavors__": [], + "json": {"meta": 1}, + "pth": video_latent, + "pickle": torch.randn(32, 128), + } + + enc = WanTaskEncoder(seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3) + diff_sample = enc.encode_sample(sample) + batch = enc.batch([diff_sample]) + + assert isinstance(batch, dict) + for k in ["video_latents", "context_embeddings", "loss_mask", "seq_len_q", "seq_len_q_padded", "seq_len_kv", "seq_len_kv_padded", "grid_sizes", "video_metadata"]: + assert k in batch + + # video_latents: [S, 1, ...], where S equals sample.video length when CP world size is 1 + assert batch["video_latents"].shape[1] == 1 + assert batch["context_embeddings"].shape[1] == 1 + assert batch["loss_mask"].shape[1] == 1 + + diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py new file mode 100644 index 00000000..4110c8e3 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py @@ -0,0 +1,71 @@ +import os + +import pytest +import torch + +from dfm.src.megatron.model.wan.flow_matching.flow_inference_pipeline import FlowInferencePipeline + + +def test_select_checkpoint_dir_latest(tmp_path): + base = tmp_path / "ckpts" + os.makedirs(base / "iter_0000100") + os.makedirs(base / "iter_0000200") + + # Minimal inference config object + class _Cfg: + num_train_timesteps = 1000 + param_dtype = torch.float32 + text_len = 512 + t5_dtype = torch.float32 + vae_stride = (1, 1, 1) + patch_size = (1, 1, 1) + + # Instantiate object without running heavy init by patching __init__ to a no-op + pip = object.__new__(FlowInferencePipeline) + pip.inference_cfg = _Cfg() + + latest = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=None) + assert latest.endswith("iter_0000200") + + specific = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=100) + assert specific.endswith("iter_0000100") + + with pytest.raises(FileNotFoundError): + FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=999) + + +def test_forward_pp_step_no_pp(monkeypatch): + # Build a minimal instance skipping heavy init + pip = object.__new__(FlowInferencePipeline) + class _Model: + class _Cfg: + hidden_size = 16 + qkv_format = "sbhd" + config = _Cfg() + def __call__(self, x, grid_sizes, t, **kwargs): + return x # echo input + def set_input_tensor(self, x): + pass + pip.model = _Model() + + # Patch parallel state to no-PP path + from megatron.core import parallel_state + monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False) + + S, B, H = 8, 1, pip.model.config.hidden_size + latent_model_input = torch.randn(S, B, H, dtype=torch.float32) + grid_sizes = [(2, 2, 2)] + timestep = torch.tensor([10.0], dtype=torch.float32) + arg_c = {} + + out = FlowInferencePipeline.forward_pp_step( + pip, + latent_model_input=latent_model_input, + grid_sizes=grid_sizes, + max_video_seq_len=S, + timestep=timestep, + arg_c=arg_c, + ) + assert out.shape == latent_model_input.shape + + diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py new file mode 100644 index 00000000..1fcc3ba4 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py @@ -0,0 +1,104 @@ +import types +import torch + +from dfm.src.megatron.model.wan.flow_matching import flow_pipeline as flow_pipeline_mod +from dfm.src.megatron.model.wan.flow_matching.flow_pipeline import FlowPipeline + + +class _DummyModel: + class _Cfg: + in_channels = 2 + patch_spatial = 1 + patch_temporal = 1 + + def __init__(self): + self.config = self._Cfg() + + def __call__(self, x, grid_sizes, t, context, packed_seq_params): + # Return zeros matching input shape (seq_len, 1, latent_dim) + return torch.zeros_like(x) + + +def test_flow_pipeline_training_step_cpu_stub(monkeypatch): + # Bypass heavy diffusers init + def _stub_init(self, model_id="x", seed=0): + self.pipe = types.SimpleNamespace(scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000))) + monkeypatch.setattr(FlowPipeline, "__init__", _stub_init) + + # Make patchify accept both tensor and list for this test + def _safe_patchify(x, patch_size): + # Always delegate to the real implementation in utils to avoid recursion + from dfm.src.megatron.model.wan import utils as wan_utils + impl = wan_utils.patchify + # Normalize inputs to expected 4D [C, F, H, W] without batch dim + if isinstance(x, list): + x_norm = [] + for t in x: + if isinstance(t, torch.Tensor) and t.dim() == 5 and t.size(0) == 1: + x_norm.append(t.squeeze(0)) + else: + x_norm.append(t) + else: + t = x + if isinstance(t, torch.Tensor) and t.dim() == 5 and t.size(0) == 1: + t = t.squeeze(0) + x_norm = [t] + return impl(x_norm, patch_size) + monkeypatch.setattr(flow_pipeline_mod, "patchify", _safe_patchify) + + # Disable context parallelism and force last pipeline stage + from megatron.core import parallel_state + monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) + monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False) + + pipe = FlowPipeline() + model = _DummyModel() + + # Build a minimal, consistent batch: seq_len = F*H*W = 2*2*2 = 8, latent_dim = in_channels * pF * pH * pW = 2 + F, H, W = 2, 2, 2 + seq_len = F * H * W + latent_dim = model.config.in_channels + + video_latents = torch.randn(seq_len, 1, latent_dim, dtype=torch.float32) + context_embeddings = torch.randn(4, 1, 8, dtype=torch.float32) + loss_mask = torch.ones(seq_len, dtype=torch.bfloat16) + grid_sizes = torch.tensor([[F, H, W]], dtype=torch.int32) + + # Packed seq params with simple cumulative lengths + from megatron.core.packed_seq_params import PackedSeqParams + cu = torch.tensor([0, seq_len], dtype=torch.int32) + packed_seq_params = { + "self_attention": PackedSeqParams( + cu_seqlens_q=cu, cu_seqlens_q_padded=cu, cu_seqlens_kv=cu, cu_seqlens_kv_padded=cu, qkv_format="sbhd" + ), + "cross_attention": PackedSeqParams( + cu_seqlens_q=cu, cu_seqlens_q_padded=cu, cu_seqlens_kv=cu, cu_seqlens_kv_padded=cu, qkv_format="sbhd" + ), + } + + batch = { + "video_latents": video_latents, + "context_embeddings": context_embeddings, + "loss_mask": loss_mask, + "grid_sizes": grid_sizes, + "packed_seq_params": packed_seq_params, + "video_metadata": {}, + } + + model_pred, weighted_loss, split_loss_mask = pipe.training_step( + model, + batch, + use_sigma_noise=True, + timestep_sampling="uniform", + flow_shift=3.0, + mix_uniform_ratio=1.0, # force uniform branch + sigma_min=0.0, + sigma_max=1.0, + ) + + # Basic shape checks + assert model_pred.shape == video_latents.shape + assert weighted_loss.shape[:2] == video_latents.shape[:2] + assert split_loss_mask.shape == loss_mask.shape + + diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py new file mode 100644 index 00000000..977fb55c --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py @@ -0,0 +1,54 @@ +import torch + +from dfm.src.megatron.model.wan.flow_matching.time_shift_utils import ( + time_shift, + compute_density_for_timestep_sampling, + get_flow_match_loss_weight, +) + + +def test_time_shift_constant_linear_sqrt_bounds_and_monotonic(): + t_small = torch.tensor(0.1, dtype=torch.float32) + t_large = torch.tensor(0.9, dtype=torch.float32) + seq_len = 512 + + # constant + s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="constant", constant=3.0) + s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="constant", constant=3.0) + assert 0.0 <= s_small.item() <= 1.0 + assert 0.0 <= s_large.item() <= 1.0 + assert s_large > s_small + + # linear + s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="linear", base_shift=0.5, max_shift=1.15) + s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="linear", base_shift=0.5, max_shift=1.15) + assert 0.0 <= s_small.item() <= 1.0 + assert 0.0 <= s_large.item() <= 1.0 + assert s_large > s_small + + # sqrt + s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="sqrt") + s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="sqrt") + assert 0.0 <= s_small.item() <= 1.0 + assert 0.0 <= s_large.item() <= 1.0 + assert s_large > s_small + + +def test_compute_density_for_timestep_sampling_modes_and_ranges(): + batch_size = 16 + for mode in ["uniform", "logit_normal", "mode"]: + u = compute_density_for_timestep_sampling(mode, batch_size=batch_size, logit_mean=0.0, logit_std=1.0) + assert u.shape == (batch_size,) + assert torch.all((0.0 <= u) & (u <= 1.0)) + + +def test_get_flow_match_loss_weight_simple_cases(): + sigma = torch.zeros(5, dtype=torch.float32) + w = get_flow_match_loss_weight(sigma, shift=3.0) + assert torch.allclose(w, torch.ones_like(w)) + + sigma = torch.ones(5, dtype=torch.float32) + w = get_flow_match_loss_weight(sigma, shift=2.0) + assert torch.allclose(w, torch.full_like(sigma, 3.0)) + + diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py new file mode 100644 index 00000000..1a0eae39 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py @@ -0,0 +1,29 @@ +from dfm.src.megatron.model.wan.inference import SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES + + +def test_size_configs_structure_and_values(): + assert isinstance(SIZE_CONFIGS, dict) + for key, val in SIZE_CONFIGS.items(): + assert isinstance(key, str) + assert isinstance(val, tuple) and len(val) == 2 + w, h = val + assert isinstance(w, int) and isinstance(h, int) + assert w > 0 and h > 0 + + +def test_max_area_configs_consistency(): + for size_key, area in MAX_AREA_CONFIGS.items(): + w, h = SIZE_CONFIGS[size_key] + assert area == w * h + + +def test_supported_sizes_lists(): + assert "t2v-14B" in SUPPORTED_SIZES + assert "t2v-1.3B" in SUPPORTED_SIZES + for model_key, sizes in SUPPORTED_SIZES.items(): + assert isinstance(sizes, tuple) + for s in sizes: + assert s in SIZE_CONFIGS + + + diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py new file mode 100644 index 00000000..b0389bbd --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py @@ -0,0 +1,65 @@ +import os +import tempfile + +import argparse +import torch + +from dfm.src.megatron.model.wan.inference import utils as inf_utils + + +def test_str2bool_variants_and_errors(): + true_vals = ["yes", "true", "t", "y", "1", "TRUE", "Yes"] + false_vals = ["no", "false", "f", "n", "0", "FALSE", "No"] + for v in true_vals: + assert inf_utils.str2bool(v) is True + for v in false_vals: + assert inf_utils.str2bool(v) is False + assert inf_utils.str2bool(True) is True + assert inf_utils.str2bool(False) is False + try: + inf_utils.str2bool("maybe") + except argparse.ArgumentTypeError: + pass + else: + assert False, "Expected argparse.ArgumentTypeError for invalid boolean string" + + +def test_cache_image_writes_file(tmp_path): + # Small 3x8x8 image + img = torch.rand(3, 8, 8) + out_path = tmp_path / "test.png" + saved = inf_utils.cache_image(img, str(out_path), nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1) + assert saved == str(out_path) + assert os.path.exists(out_path) + assert os.path.getsize(out_path) > 0 + + +def test_cache_video_uses_writer_and_returns_path(monkeypatch): + # Stub imageio.get_writer to avoid codec dependency + calls = {"frames": 0, "path": None} + + class _DummyWriter: + def __init__(self, path, fps=None, codec=None, quality=None): + calls["path"] = path + def append_data(self, frame): + calls["frames"] += 1 + def close(self): + pass + + monkeypatch.setattr(inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality)) + + # Stub make_grid to return a fixed CHW tensor regardless of input + def _fake_make_grid(x, nrow, normalize, value_range): + return torch.rand(3, 4, 5) + monkeypatch.setattr(inf_utils.torchvision.utils, "make_grid", _fake_make_grid) + + # Build a tensor whose unbind(2) yields 2 slices so we expect 2 frames written + vid = torch.rand(3, 3, 2, 2) # shape chosen to exercise unbind(2) + with tempfile.TemporaryDirectory() as td: + out_file = os.path.join(td, "out.mp4") + result = inf_utils.cache_video(vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1) + assert result == out_file + assert calls["path"] == out_file + assert calls["frames"] == vid.shape[2] # frames equal to number of unbinds on dim=2 + + diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py new file mode 100644 index 00000000..be7935b8 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py @@ -0,0 +1,38 @@ +import pytest +import torch + +from dfm.src.megatron.model.wan.rope_utils import Wan3DRopeEmbeddings + + +def test_wan3d_rope_embeddings_shapes_and_padding(): + # Small, CPU-friendly config + n_head = 2 + dim_head = 8 # must be divisible with the internal splits + max_position_len = 16 + rope = Wan3DRopeEmbeddings(dim_head=dim_head, max_position_len=max_position_len) + + # Two samples with different (f, h, w) + grid_sizes = torch.tensor([[2, 3, 2], [4, 1, 1]], dtype=torch.int32) + seq_lens = [(2 * 3 * 2), (4 * 1 * 1)] + padded_lens = [seq_lens[0] + 2, seq_lens[1]] # pad first sample + + cu_seqlens_q_padded = torch.tensor([0, padded_lens[0], padded_lens[0] + padded_lens[1]], dtype=torch.int32) + + out = rope( + n_head=n_head, + dim_head=dim_head, + cu_seqlens_q_padded=cu_seqlens_q_padded, + grid_sizes=grid_sizes, + device=torch.device("cpu"), + ) + + # Total concatenated length equals sum of padded lens + assert out.shape == (sum(padded_lens), 1, 1, dim_head) + + # Check that padding region for the first sample is zero + first_seq_len = seq_lens[0] + first_padded_len = padded_lens[0] + tail = out[first_seq_len:first_padded_len] + assert torch.all(tail == 0), "Padded region should be zeros" + + diff --git a/tests/unit_tests/megatron/model/wan/test_utils.py b/tests/unit_tests/megatron/model/wan/test_utils.py new file mode 100644 index 00000000..6ee48c75 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_utils.py @@ -0,0 +1,36 @@ +import torch + +from dfm.src.megatron.model.wan.utils import grid_sizes_calculation, patchify, unpatchify + + +def test_grid_sizes_calculation_basic(): + input_shape = (4, 8, 6) + patch_size = (1, 2, 3) + f, h, w = grid_sizes_calculation(input_shape, patch_size) + assert (f, h, w) == (4, 4, 2) + + +def test_patchify_unpatchify_roundtrip(): + # Video latent: [c, F_patches * pF, H_patches * pH, W_patches * pW] + c = 3 + F_patches, H_patches, W_patches = 2, 2, 3 + patch_size = (1, 2, 2) + F_latents = F_patches * patch_size[0] + H_latents = H_patches * patch_size[1] + W_latents = W_patches * patch_size[2] + + x = [torch.randn(c, F_latents, H_latents, W_latents)] + + patches = patchify(x, patch_size) + assert isinstance(patches, list) and len(patches) == 1 + seq_len, dim = patches[0].shape + assert seq_len == F_patches * H_patches * W_patches + assert dim == c * (patch_size[0] * patch_size[1] * patch_size[2]) + + # Unpatchify and compare + y = unpatchify(patches, [[F_patches, H_patches, W_patches]], out_dim=c, patch_size=patch_size) + assert isinstance(y, list) and len(y) == 1 + assert y[0].shape == x[0].shape + torch.testing.assert_close(y[0], x[0], rtol=1e-5, atol=1e-5) + + diff --git a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py new file mode 100644 index 00000000..de327a56 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py @@ -0,0 +1,14 @@ +from dfm.src.megatron.model.wan.wan_layer_spec import get_wan_block_with_transformer_engine_spec + + +def test_get_wan_block_with_transformer_engine_spec_basic(): + spec = get_wan_block_with_transformer_engine_spec() + # Basic structure checks + assert hasattr(spec, "module") + assert hasattr(spec, "submodules") + sub = spec.submodules + # Expected submodule fields exist + for name in ["norm1", "norm2", "norm3", "full_self_attention", "cross_attention", "mlp"]: + assert hasattr(sub, name), f"Missing submodule {name}" + + diff --git a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py new file mode 100644 index 00000000..09b65729 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py @@ -0,0 +1,13 @@ +import torch + +from dfm.src.megatron.model.wan.wan_model import sinusoidal_embedding_1d + + +def test_sinusoidal_embedding_1d_shape_and_dtype(): + dim = 16 + pos = torch.arange(10, dtype=torch.float32) + emb = sinusoidal_embedding_1d(dim, pos) + assert emb.shape == (pos.shape[0], dim) + assert emb.dtype == torch.float32 + + diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py new file mode 100644 index 00000000..ea337aa7 --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py @@ -0,0 +1,55 @@ +import torch + +from dfm.src.megatron.model.wan.wan_provider import WanModelProvider +from dfm.src.megatron.model.wan.wan_model import WanModel +from megatron.core import parallel_state + + +def test_wan_model_provider_provide_returns_model(monkeypatch): + # Force pipeline stage booleans to avoid dependency on initialized model parallel + monkeypatch.setattr(parallel_state, "is_pipeline_first_stage", lambda: True, raising=False) + monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False) + # Avoid querying uninitialized PP groups + monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False) + + provider = WanModelProvider( + num_layers=2, # keep small + hidden_size=64, + ffn_hidden_size=128, + num_attention_heads=4, + layernorm_epsilon=1e-6, + normalization="RMSNorm", + layernorm_zero_centered_gamma=False, + layernorm_across_heads=True, + add_qkv_bias=True, + rotary_interleaved=True, + hidden_dropout=0.0, + attention_dropout=0.0, + fp16_lm_cross_entropy=False, + parallel_output=True, + bf16=False, + params_dtype=torch.float32, + qkv_format="sbhd", + seq_length=128, + share_embeddings_and_output_weights=False, + vocab_size=32000, + make_vocab_size_divisible_by=128, + in_channels=4, + out_channels=4, + patch_spatial=2, + patch_temporal=1, + freq_dim=16, + text_len=32, + text_dim=64, + ) + # Ensure config supplies fields expected by core attention + provider.kv_channels = provider.hidden_size // provider.num_attention_heads + provider.num_query_groups = provider.num_attention_heads + model = provider.provide() + assert isinstance(model, WanModel) + # Sanity check key config properties were plumbed + assert model.config.hidden_size == 64 + assert model.config.num_attention_heads == 4 + assert model.config.text_dim == 64 + + diff --git a/tests/unit_tests/megatron/model/wan/test_wan_step.py b/tests/unit_tests/megatron/model/wan/test_wan_step.py new file mode 100644 index 00000000..5c04f3db --- /dev/null +++ b/tests/unit_tests/megatron/model/wan/test_wan_step.py @@ -0,0 +1,50 @@ +import pytest +import torch + +from dfm.src.megatron.model.wan.wan_step import wan_data_step, WanForwardStep + + +class _DummyIter: + def __init__(self, batch): + # mimic attribute used inside wan_data_step + self.iterable = [batch] + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="wan_data_step moves tensors to CUDA") +def test_wan_data_step_builds_packed_seq_params_cuda_guarded(): + # Construct minimal batch with required seq_len fields + batch = { + "seq_len_q": torch.tensor([3, 5], dtype=torch.int32), + "seq_len_q_padded": torch.tensor([4, 6], dtype=torch.int32), + "seq_len_kv": torch.tensor([2, 7], dtype=torch.int32), + "seq_len_kv_padded": torch.tensor([2, 8], dtype=torch.int32), + # include a tensor field to exercise device transfer + "video_latents": torch.randn(8, 1, 4, dtype=torch.float32), + } + it = _DummyIter(batch) + qkv_format = "sbhd" + out = wan_data_step(qkv_format, it) + + assert "packed_seq_params" in out + for k in ["self_attention", "cross_attention"]: + assert k in out["packed_seq_params"] + p = out["packed_seq_params"][k] + assert hasattr(p, "cu_seqlens_q") + assert hasattr(p, "cu_seqlens_q_padded") + assert hasattr(p, "cu_seqlens_kv") + assert hasattr(p, "cu_seqlens_kv_padded") + # spot-check CUDA device after move + assert out["video_latents"].is_cuda + + +def test_wan_forward_step_loss_partial_creation(): + step = WanForwardStep() + mask = torch.ones(4, dtype=torch.float32) + loss_fn = step._create_loss_function(mask, check_for_nan_in_loss=False, check_for_spiky_loss=False) + # Just validate it's callable and is a functools.partial + import functools + + assert isinstance(loss_fn, functools.partial) + assert callable(loss_fn) + + From 93f94bcfc0d692b6433f9f544e5287f86f14e65d Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Fri, 14 Nov 2025 08:22:51 -0800 Subject: [PATCH 02/20] ruff lint --- .../data/wan/test_wan_energon_datamodule.py | 16 ++++++++++++--- .../data/wan/test_wan_mock_datamodule.py | 17 +++++++++++++--- .../megatron/data/wan/test_wan_taskencoder.py | 7 +++++-- .../test_flow_inference_pipeline.py | 14 +++++++++++++ .../wan/flow_matching/test_flow_pipeline.py | 15 ++++++++++++++ .../flow_matching/test_time_shift_utils.py | 16 ++++++++++++++- .../wan/inference/test_inference_init.py | 16 ++++++++++++++- .../wan/inference/test_inference_utils.py | 16 ++++++++++++++- .../megatron/model/wan/test_rope_utils.py | 15 +++++++++++++- .../megatron/model/wan/test_utils.py | 16 +++++++++++++-- .../megatron/model/wan/test_wan_layer_spec.py | 16 +++++++++++++-- .../megatron/model/wan/test_wan_model_misc.py | 16 +++++++++++++-- .../megatron/model/wan/test_wan_provider.py | 20 +++++++++++++++---- .../megatron/model/wan/test_wan_step.py | 18 ++++++++++++++--- 14 files changed, 193 insertions(+), 25 deletions(-) diff --git a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py index 6fb27798..c4dc6014 100644 --- a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py +++ b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py @@ -1,4 +1,16 @@ -import types +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from dfm.src.megatron.data.wan import wan_energon_datamodule as wan_dm_mod from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder @@ -53,5 +65,3 @@ def test_wan_datamodule_config_initialization(monkeypatch): # build_datasets should return train loader thrice train, val, test = cfg.build_datasets(context=None) assert train == "train" and val == "train" and test == "train" - - diff --git a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py index d8f10d74..e1980052 100644 --- a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py +++ b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py @@ -1,5 +1,18 @@ -import torch +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch from torch.utils.data import DataLoader from dfm.src.megatron.data.wan.wan_mock_datamodule import WanMockDataModuleConfig @@ -50,5 +63,3 @@ def test_wan_mock_datamodule_build_and_batch_shapes(): assert batch["seq_len_kv"].dtype == torch.int32 assert batch["seq_len_kv_padded"].dtype == torch.int32 assert batch["grid_sizes"].dtype == torch.int32 - - diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py index 2de5173d..c4ed63b3 100644 --- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py +++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest import torch from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder, cook, parallel_state @@ -43,9 +42,13 @@ def test_encode_sample_no_context_parallel(monkeypatch): monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) # Ensure seeded wrapper has an active worker config from megatron.energon.task_encoder.base import WorkerConfig + class _FakeWorkerCfg: - def worker_seed(self): return 123 + def worker_seed(self): + return 123 + active_worker_sample_index = 0 + monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False) # Construct a minimal, consistent sample diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py index 4110c8e3..52e19708 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import pytest diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py index 1fcc3ba4..8511068e 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py @@ -1,4 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import types + import torch from dfm.src.megatron.model.wan.flow_matching import flow_pipeline as flow_pipeline_mod diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py index 977fb55c..9739c626 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py @@ -1,9 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from dfm.src.megatron.model.wan.flow_matching.time_shift_utils import ( - time_shift, compute_density_for_timestep_sampling, get_flow_match_loss_weight, + time_shift, ) diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py index 1a0eae39..514857fd 100644 --- a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py @@ -1,4 +1,18 @@ -from dfm.src.megatron.model.wan.inference import SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dfm.src.megatron.model.wan.inference import MAX_AREA_CONFIGS, SIZE_CONFIGS, SUPPORTED_SIZES def test_size_configs_structure_and_values(): diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py index b0389bbd..022cddce 100644 --- a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py @@ -1,7 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse import os import tempfile -import argparse import torch from dfm.src.megatron.model.wan.inference import utils as inf_utils diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py index be7935b8..e9451c64 100644 --- a/tests/unit_tests/megatron/model/wan/test_rope_utils.py +++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py @@ -1,4 +1,17 @@ -import pytest +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from dfm.src.megatron.model.wan.rope_utils import Wan3DRopeEmbeddings diff --git a/tests/unit_tests/megatron/model/wan/test_utils.py b/tests/unit_tests/megatron/model/wan/test_utils.py index 6ee48c75..3f89b4cd 100644 --- a/tests/unit_tests/megatron/model/wan/test_utils.py +++ b/tests/unit_tests/megatron/model/wan/test_utils.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from dfm.src.megatron.model.wan.utils import grid_sizes_calculation, patchify, unpatchify @@ -32,5 +46,3 @@ def test_patchify_unpatchify_roundtrip(): assert isinstance(y, list) and len(y) == 1 assert y[0].shape == x[0].shape torch.testing.assert_close(y[0], x[0], rtol=1e-5, atol=1e-5) - - diff --git a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py index de327a56..21ee570e 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from dfm.src.megatron.model.wan.wan_layer_spec import get_wan_block_with_transformer_engine_spec @@ -10,5 +24,3 @@ def test_get_wan_block_with_transformer_engine_spec_basic(): # Expected submodule fields exist for name in ["norm1", "norm2", "norm3", "full_self_attention", "cross_attention", "mlp"]: assert hasattr(sub, name), f"Missing submodule {name}" - - diff --git a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py index 09b65729..de141def 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch from dfm.src.megatron.model.wan.wan_model import sinusoidal_embedding_1d @@ -9,5 +23,3 @@ def test_sinusoidal_embedding_1d_shape_and_dtype(): emb = sinusoidal_embedding_1d(dim, pos) assert emb.shape == (pos.shape[0], dim) assert emb.dtype == torch.float32 - - diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py index ea337aa7..bd451ff7 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py @@ -1,8 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch +from megatron.core import parallel_state -from dfm.src.megatron.model.wan.wan_provider import WanModelProvider from dfm.src.megatron.model.wan.wan_model import WanModel -from megatron.core import parallel_state +from dfm.src.megatron.model.wan.wan_provider import WanModelProvider def test_wan_model_provider_provide_returns_model(monkeypatch): @@ -51,5 +65,3 @@ def test_wan_model_provider_provide_returns_model(monkeypatch): assert model.config.hidden_size == 64 assert model.config.num_attention_heads == 4 assert model.config.text_dim == 64 - - diff --git a/tests/unit_tests/megatron/model/wan/test_wan_step.py b/tests/unit_tests/megatron/model/wan/test_wan_step.py index 5c04f3db..8ee0e9cb 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_step.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_step.py @@ -1,7 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest import torch -from dfm.src.megatron.model.wan.wan_step import wan_data_step, WanForwardStep +from dfm.src.megatron.model.wan.wan_step import WanForwardStep, wan_data_step class _DummyIter: @@ -46,5 +60,3 @@ def test_wan_forward_step_loss_partial_creation(): assert isinstance(loss_fn, functools.partial) assert callable(loss_fn) - - From 244ee74ee6d93f2710bbb8fb835a985f6ff0b570 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Fri, 14 Nov 2025 08:34:06 -0800 Subject: [PATCH 03/20] ruff lint --- .../megatron/data/wan/test_wan_taskencoder.py | 29 +++++++++++++++---- .../test_flow_inference_pipeline.py | 8 +++-- .../wan/flow_matching/test_flow_pipeline.py | 8 +++-- .../wan/inference/test_inference_utils.py | 13 ++++++--- .../megatron/model/wan/test_rope_utils.py | 2 -- 5 files changed, 43 insertions(+), 17 deletions(-) diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py index c4ed63b3..d190ef30 100644 --- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py +++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py @@ -69,7 +69,10 @@ def worker_seed(self): "pickle": context_embeddings, } - enc = WanTaskEncoder(seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None) + + enc = WanTaskEncoder( + seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None + ) out = enc.encode_sample(sample) # Grid / patches @@ -104,9 +107,13 @@ def test_batch_with_packing_buffer_size(monkeypatch): monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) # Ensure seeded wrapper has an active worker config from megatron.energon.task_encoder.base import WorkerConfig + class _FakeWorkerCfg: - def worker_seed(self): return 456 + def worker_seed(self): + return 456 + active_worker_sample_index = 0 + monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False) c = 4 @@ -122,17 +129,27 @@ def worker_seed(self): return 456 "pickle": torch.randn(32, 128), } - enc = WanTaskEncoder(seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3) + enc = WanTaskEncoder( + seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3 + ) diff_sample = enc.encode_sample(sample) batch = enc.batch([diff_sample]) assert isinstance(batch, dict) - for k in ["video_latents", "context_embeddings", "loss_mask", "seq_len_q", "seq_len_q_padded", "seq_len_kv", "seq_len_kv_padded", "grid_sizes", "video_metadata"]: + for k in [ + "video_latents", + "context_embeddings", + "loss_mask", + "seq_len_q", + "seq_len_q_padded", + "seq_len_kv", + "seq_len_kv_padded", + "grid_sizes", + "video_metadata", + ]: assert k in batch # video_latents: [S, 1, ...], where S equals sample.video length when CP world size is 1 assert batch["video_latents"].shape[1] == 1 assert batch["context_embeddings"].shape[1] == 1 assert batch["loss_mask"].shape[1] == 1 - - diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py index 52e19708..82471a8e 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py @@ -36,6 +36,7 @@ class _Cfg: # Instantiate object without running heavy init by patching __init__ to a no-op pip = object.__new__(FlowInferencePipeline) + pip.inference_cfg = _Cfg() latest = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=None) @@ -55,15 +56,20 @@ class _Model: class _Cfg: hidden_size = 16 qkv_format = "sbhd" + config = _Cfg() + def __call__(self, x, grid_sizes, t, **kwargs): return x # echo input + def set_input_tensor(self, x): pass + pip.model = _Model() # Patch parallel state to no-PP path from megatron.core import parallel_state + monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False) S, B, H = 8, 1, pip.model.config.hidden_size @@ -81,5 +87,3 @@ def set_input_tensor(self, x): arg_c=arg_c, ) assert out.shape == latent_model_input.shape - - diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py index 8511068e..70fa1fc1 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py @@ -37,13 +37,17 @@ def __call__(self, x, grid_sizes, t, context, packed_seq_params): def test_flow_pipeline_training_step_cpu_stub(monkeypatch): # Bypass heavy diffusers init def _stub_init(self, model_id="x", seed=0): - self.pipe = types.SimpleNamespace(scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000))) + self.pipe = types.SimpleNamespace( + scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000)) + ) + monkeypatch.setattr(FlowPipeline, "__init__", _stub_init) # Make patchify accept both tensor and list for this test def _safe_patchify(x, patch_size): # Always delegate to the real implementation in utils to avoid recursion from dfm.src.megatron.model.wan import utils as wan_utils + impl = wan_utils.patchify # Normalize inputs to expected 4D [C, F, H, W] without batch dim if isinstance(x, list): @@ -115,5 +119,3 @@ def _safe_patchify(x, patch_size): assert model_pred.shape == video_latents.shape assert weighted_loss.shape[:2] == video_latents.shape[:2] assert split_loss_mask.shape == loss_mask.shape - - diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py index 022cddce..8445fddd 100644 --- a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py @@ -55,25 +55,30 @@ def test_cache_video_uses_writer_and_returns_path(monkeypatch): class _DummyWriter: def __init__(self, path, fps=None, codec=None, quality=None): calls["path"] = path + def append_data(self, frame): calls["frames"] += 1 + def close(self): pass - monkeypatch.setattr(inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality)) + monkeypatch.setattr( + inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality) + ) # Stub make_grid to return a fixed CHW tensor regardless of input def _fake_make_grid(x, nrow, normalize, value_range): return torch.rand(3, 4, 5) + monkeypatch.setattr(inf_utils.torchvision.utils, "make_grid", _fake_make_grid) # Build a tensor whose unbind(2) yields 2 slices so we expect 2 frames written vid = torch.rand(3, 3, 2, 2) # shape chosen to exercise unbind(2) with tempfile.TemporaryDirectory() as td: out_file = os.path.join(td, "out.mp4") - result = inf_utils.cache_video(vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1) + result = inf_utils.cache_video( + vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1 + ) assert result == out_file assert calls["path"] == out_file assert calls["frames"] == vid.shape[2] # frames equal to number of unbinds on dim=2 - - diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py index e9451c64..7e31d8d0 100644 --- a/tests/unit_tests/megatron/model/wan/test_rope_utils.py +++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py @@ -47,5 +47,3 @@ def test_wan3d_rope_embeddings_shapes_and_padding(): first_padded_len = padded_lens[0] tail = out[first_seq_len:first_padded_len] assert torch.all(tail == 0), "Padded region should be zeros" - - From 550f283a5a3455e5b33e2d4a43aa124e2c6d0299 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Fri, 14 Nov 2025 08:51:38 -0800 Subject: [PATCH 04/20] ruff lint --- tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py | 1 - .../model/wan/flow_matching/test_flow_inference_pipeline.py | 1 + .../megatron/model/wan/flow_matching/test_flow_pipeline.py | 3 +++ .../megatron/model/wan/flow_matching/test_time_shift_utils.py | 2 -- .../megatron/model/wan/inference/test_inference_init.py | 3 --- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py index d190ef30..e739a1ec 100644 --- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py +++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py @@ -69,7 +69,6 @@ def worker_seed(self): "pickle": context_embeddings, } - enc = WanTaskEncoder( seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None ) diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py index 82471a8e..95e2cfca 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py @@ -52,6 +52,7 @@ class _Cfg: def test_forward_pp_step_no_pp(monkeypatch): # Build a minimal instance skipping heavy init pip = object.__new__(FlowInferencePipeline) + class _Model: class _Cfg: hidden_size = 16 diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py index 70fa1fc1..d93a4298 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py @@ -63,10 +63,12 @@ def _safe_patchify(x, patch_size): t = t.squeeze(0) x_norm = [t] return impl(x_norm, patch_size) + monkeypatch.setattr(flow_pipeline_mod, "patchify", _safe_patchify) # Disable context parallelism and force last pipeline stage from megatron.core import parallel_state + monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False) monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False) @@ -85,6 +87,7 @@ def _safe_patchify(x, patch_size): # Packed seq params with simple cumulative lengths from megatron.core.packed_seq_params import PackedSeqParams + cu = torch.tensor([0, seq_len], dtype=torch.int32) packed_seq_params = { "self_attention": PackedSeqParams( diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py index 9739c626..b239584c 100644 --- a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py +++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py @@ -64,5 +64,3 @@ def test_get_flow_match_loss_weight_simple_cases(): sigma = torch.ones(5, dtype=torch.float32) w = get_flow_match_loss_weight(sigma, shift=2.0) assert torch.allclose(w, torch.full_like(sigma, 3.0)) - - diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py index 514857fd..f8005047 100644 --- a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py +++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py @@ -38,6 +38,3 @@ def test_supported_sizes_lists(): assert isinstance(sizes, tuple) for s in sizes: assert s in SIZE_CONFIGS - - - From 926a9513a36c2773077262eaa9fb465e9207ab9a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 13:19:06 -0800 Subject: [PATCH 05/20] Explicit mcore path override to use Megatron-Bridge's pinned submodule commit Signed-off-by: Pablo Garay --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 56e934d9..05a40a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } +megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } From ecdef9e80256ff5d2e42508d09847c4978b2be22 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:04:03 -0800 Subject: [PATCH 06/20] Update Megatron-Bridge submodule to latest main with correct Megatron-LM commit (3cbe5c68) Signed-off-by: Pablo Garay --- 3rdparty/Megatron-Bridge | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 8e21f81a..4e4ce420 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 +Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a From 50385183f621feb8351d6bd7888249d235baa8ec Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:25:20 -0800 Subject: [PATCH 07/20] Add Mcore WAN pretrain mock test to CI/CD Signed-off-by: Pablo Garay --- .github/workflows/cicd-main.yml | 3 + .../L2_Mcore_Mock_Tests_GPU.sh | 15 +++ .../test_mcore_wan_pretrain.py | 109 ++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh create mode 100644 tests/functional_tests/test_mcore_wan_pretrain.py diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b3f08665..7f792d47 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -92,6 +92,9 @@ jobs: - script: L2_Functional_Tests_GPU runner: linux-amd64-gpu-rtxa6000-latest-2-nemo timeout: 30 + - script: L2_Mcore_Mock_Tests_GPU + runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + timeout: 30 needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh new file mode 100644 index 00000000..871b9e6a --- /dev/null +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v + diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py new file mode 100644 index 00000000..b19836af --- /dev/null +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -0,0 +1,109 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functional smoke tests for Mcore WAN pretrain mock runs.""" + +import os +import subprocess +import tempfile + +import pytest + + +class TestMcoreWanPretrain: + """Test class for Mcore WAN pretrain functional tests.""" + + @pytest.mark.run_only_on("GPU") + def test_wan_pretrain_mock(self, tmp_path): + """ + Functional test for WAN pretrain recipe with mock data. + + This test verifies that the WAN pretrain recipe can run successfully + in mock mode with minimal configuration, ensuring: + 1. The distributed training can start without errors + 2. Model initialization works correctly + 3. Forward/backward passes complete successfully + 4. The training loop executes without crashes + """ + # Set up temporary directories for dataset and checkpoints + dataset_path = os.path.join(tmp_path, "mock_dataset") + checkpoint_dir = os.path.join(tmp_path, "checkpoints") + os.makedirs(dataset_path, exist_ok=True) + os.makedirs(checkpoint_dir, exist_ok=True) + + # Build the command for the mock run + cmd = [ + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node=1", + "examples/megatron/recipes/wan/pretrain_wan.py", + "--training-mode", + "pretrain", + "model.tensor_model_parallel_size=1", + "model.pipeline_model_parallel_size=1", + "model.context_parallel_size=1", + "model.crossattn_emb_size=1536", + "model.hidden_size=1536", + "model.ffn_hidden_size=8960", + "model.num_attention_heads=12", + "model.num_layers=3", + "model.qkv_format=thd", + f"dataset.path={dataset_path}", + f"checkpoint.save={checkpoint_dir}", + f"checkpoint.load={checkpoint_dir}", + "checkpoint.load_optim=false", + "checkpoint.save_interval=200", + "optimizer.lr=5e-6", + "optimizer.min_lr=5e-6", + "train.eval_iters=0", + "scheduler.lr_decay_style=constant", + "scheduler.lr_warmup_iters=0", + "model.seq_length=2048", + "dataset.seq_length=2048", + "train.global_batch_size=2", + "train.micro_batch_size=1", + "dataset.global_batch_size=2", + "dataset.micro_batch_size=1", + "logger.log_interval=1", + "--mock", + ] + + # Run the command with a timeout + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + check=True, + ) + + # Print output for debugging if needed + print("STDOUT:", result.stdout) + print("STDERR:", result.stderr) + + # Basic verification that the run completed + assert result.returncode == 0, f"Command failed with return code {result.returncode}" + + # Check for common success indicators in output + assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( + "Expected to see iteration progress in output" + ) + + except subprocess.TimeoutExpired: + pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") + except subprocess.CalledProcessError as e: + pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") + From c746d180d102a415412e28bed4e27eff8ce40347 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 14:30:53 -0800 Subject: [PATCH 08/20] lintfix Signed-off-by: Pablo Garay --- tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 1 - tests/functional_tests/test_mcore_wan_pretrain.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index 871b9e6a..2e99db05 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -12,4 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v - diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index b19836af..780bb253 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -16,7 +16,6 @@ import os import subprocess -import tempfile import pytest @@ -106,4 +105,3 @@ def test_wan_pretrain_mock(self, tmp_path): pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") - From 697201d23eb0417cf5cbb126a679719a35ed7f79 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 14 Nov 2025 17:16:42 -0800 Subject: [PATCH 09/20] Fix slow Docker build from Megatron-LM source Signed-off-by: Pablo Garay --- docker/Dockerfile.ci | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 7096b3c6..8de9c016 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages # Copy dependency files and source code (needed for dynamic version resolution) COPY pyproject.toml uv.lock ./ COPY dfm ./dfm -COPY 3rdparty ./3rdparty + +# Copy 3rdparty dependencies with minimal files for metadata resolution +# Copy Automodel +COPY 3rdparty/Automodel ./3rdparty/Automodel + +# Copy Megatron-Bridge +COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/ +COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src + +# Copy minimal Megatron-LM files for metadata (prevents full source build) +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ +COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/ # Install dependencies in two steps: # 1. Install build dependencies first (required for packages with no-build-isolation) From 013ca6da5d4291401218b2dafe8ce4ebc7dd88bc Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sat, 15 Nov 2025 21:33:27 -0600 Subject: [PATCH 10/20] ci: Update gpu runners to use self-hosted-nemo (#48) * ci: Update gpu runners to use self-hosted-nemo Signed-off-by: Charlie Truong * Use uv run in test_mcore_wan_pretrain Signed-off-by: Charlie Truong * Ensure uv group megatron-bridge is used for test_mcore_wan_pretrain Signed-off-by: Charlie Truong * Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain * Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain Signed-off-by: Charlie Truong * Revert GHA changes Signed-off-by: Charlie Truong * Move uv run group call to L2_Mcore_Mock_Tests_GPU Signed-off-by: Charlie Truong * Set test back to 5 minute timeout Signed-off-by: Charlie Truong * Megatron fixes (#49) * Enhance DiT and Wan layer specifications - Updated `get_query_key_value_tensors` method in `dit_attention.py` to include an `output_gate` parameter and set `split_qkv` to default to `True`. - Modified `WanLayerWithAdaLN` class in `wan_layer_spec.py` to add `rotary_pos_cos_sin` parameter for improved positional encoding handling. * Implement ProcessGroupCollection initialization in DiT and Wan models - Added initialization of `pg_collection` in both `DiTCrossAttentionModel` and `WanModel` to ensure proper handling of process groups. - This change checks if `pg_collection` exists and is not None before assigning it, enhancing the robustness of the models. * Update CONTRIBUTING.md to include detailed setup instructions for development environment and Docker container usage. Added sections for building and running the container, as well as setting the PYTHONPATH for DFM. * Refactor import statements in dit_model.py to streamline dependencies. Removed redundant import of ProcessGroupCollection, enhancing code clarity and maintainability. * Refactor code style in DiT and Wan models - Updated string quotes in `dit_model.py` and `wan_model.py` for consistency, changing from single to double quotes. - Reformatted the `get_query_key_value_tensors` method call in `dit_attention.py` for improved readability by breaking it into multiple lines. * Revert M4 changes * Ruff * Ruff * Lint --------- Co-authored-by: Abhinav Garg * Revert "Revert GHA changes" This reverts commit d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3. * tempfortest: timeout setting Signed-off-by: Pablo Garay * workflow dispatch Signed-off-by: Pablo Garay * update Signed-off-by: Pablo Garay * add logging Signed-off-by: Pablo Garay * Update test configuration for Mcore WAN pretraining - Increased the number of processes per node from 1 to 2 for distributed training. - Set the number of training iterations to 10 to enhance the training process. * More changes * Lint --------- Signed-off-by: Charlie Truong Signed-off-by: Pablo Garay Co-authored-by: Abhinav Garg Co-authored-by: Pablo Garay Signed-off-by: Pablo Garay --- .github/actions/test-template/action.yml | 2 +- .github/workflows/cicd-main.yml | 80 ++++++++++--------- CONTRIBUTING.md | 28 +++++++ .../megatron/model/common/dit_attention.py | 8 +- dfm/src/megatron/model/dit/dit_model.py | 1 - dfm/src/megatron/model/wan/wan_layer_spec.py | 1 + .../L2_Mcore_Mock_Tests_GPU.sh | 2 +- .../test_mcore_wan_pretrain.py | 15 ++-- 8 files changed, 83 insertions(+), 54 deletions(-) diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml index ccf10a80..ef0e5446 100644 --- a/.github/actions/test-template/action.yml +++ b/.github/actions/test-template/action.yml @@ -161,7 +161,7 @@ runs: -d \ --name nemo_container_${{ github.run_id }} ${ARG[@]} \ --shm-size=64g \ - --env TRANSFORMERS_OFFLINE=1 \ + --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/HF_HOME \ --env RUN_ID=${{ github.run_id }} \ diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7f792d47..fc8b2c25 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -13,6 +13,7 @@ # limitations under the License. name: CICD NeMo on: + workflow_dispatch: schedule: - cron: 0 0 * * * push: @@ -45,57 +46,58 @@ jobs: with: image-name: dfm dockerfile: docker/Dockerfile.ci + runner: self-hosted-nemo secrets: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo - timeout: 30 - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: recursive - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: dfm - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + # cicd-unit-tests: + # strategy: + # fail-fast: false + # matrix: + # include: + # - script: L0_Unit_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 + # - script: L0_Unit_Tests_CPU + # runner: linux-amd64-cpu16 + # cpu-only: true + # needs: [cicd-container-build] + # runs-on: ${{ matrix.runner }} + # name: ${{ matrix.script }} + # environment: nemo-ci + # steps: + # - name: Checkout + # uses: actions/checkout@v4 + # with: + # submodules: recursive + # - name: main + # uses: ./.github/actions/test-template + # with: + # runner: ${{ runner.name }} + # script: ${{ matrix.script }} + # timeout: ${{ matrix.timeout || 10 }} + # is_unit_test: "true" + # image: dfm + # cpu-only: ${{ matrix.cpu-only || false }} + # has-azure-credentials: "true" + # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo - timeout: 30 + # - script: L2_Functional_Tests_GPU + # runner: self-hosted-nemo + # timeout: 30 - script: L2_Mcore_Mock_Tests_GPU - runner: linux-amd64-gpu-rtxa6000-latest-2-nemo + runner: self-hosted-nemo timeout: 30 - needs: [cicd-unit-tests] + needs: [cicd-container-build] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -120,7 +122,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - - cicd-unit-tests + # - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 68ab66d4..aed9cf99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,32 @@ # Contributing To NeMo DFM +## 🛠️ Setting Up Your Environment + +Use the instructions below to setup a dev environment and a dev container + +### Building a container +```bash +# We recommend you to get the latest commits for Megatron-Bridge and Autmodel +# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands +git submodule update --init --recursive --remote # Get all the 3rd party submodules +cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong +# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty +git checkout +cd ../../../../ +docker build -f docker/Dockerfile.ci -t dfm:latest . +``` + +### Run the container +```bash +docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash +``` + +### inside the container +```bash +# Add DFM to PYTHONPATH +export PYTHONPATH=$PYTHONPATH:/opt/DFM + +# Run a Mock Run: +``` ## Signing Your Work diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py index 321e9b08..acf39d47 100644 --- a/dfm/src/megatron/model/common/dit_attention.py +++ b/dfm/src/megatron/model/common/dit_attention.py @@ -100,7 +100,7 @@ def __init__( else: self.k_layernorm = None - def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True): """ Derives `query`, `key` and `value` tensors from `hidden_states`. """ @@ -251,13 +251,15 @@ def __init__( is_expert=False, ) - def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False): + def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True): """ Derives `query` tensor from `hidden_states`, and `key`/`value` tensors from `key_value_states`. """ - query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states) + query, key, value = super().get_query_key_value_tensors( + hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv + ) # gather query and key heads across TP ranks if self.layernorm_across_heads is True if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1: diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py index e3ae8a29..38cb8422 100644 --- a/dfm/src/megatron/model/dit/dit_model.py +++ b/dfm/src/megatron/model/dit/dit_model.py @@ -105,7 +105,6 @@ def __init__( super(DiTCrossAttentionModel, self).__init__(config=config) self.config: TransformerConfig = config - self.transformer_decoder_layer_spec = transformer_decoder_layer_spec() self.pre_process = pre_process self.post_process = post_process diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py index 2b355930..a0d6354e 100644 --- a/dfm/src/megatron/model/wan/wan_layer_spec.py +++ b/dfm/src/megatron/model/wan/wan_layer_spec.py @@ -162,6 +162,7 @@ def forward( packed_seq_params=None, sequence_len_offset=None, inference_context=None, + rotary_pos_cos_sin=None, ): # the timestep embedding is stored in attention_mask argument timestep_emb = attention_mask diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index 2e99db05..b8d237a1 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py index 780bb253..0c9879d9 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/test_mcore_wan_pretrain.py @@ -46,7 +46,7 @@ def test_wan_pretrain_mock(self, tmp_path): "python", "-m", "torch.distributed.run", - "--nproc_per_node=1", + "--nproc_per_node=2", "examples/megatron/recipes/wan/pretrain_wan.py", "--training-mode", "pretrain", @@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path): "optimizer.lr=5e-6", "optimizer.min_lr=5e-6", "train.eval_iters=0", + "train.train_iters=10", "scheduler.lr_decay_style=constant", "scheduler.lr_warmup_iters=0", "model.seq_length=2048", @@ -81,11 +82,12 @@ def test_wan_pretrain_mock(self, tmp_path): # Run the command with a timeout try: + # Stream output in real-time instead of capturing it result = subprocess.run( cmd, capture_output=True, text=True, - timeout=300, # 5 minute timeout + timeout=1800, # 30 minute timeout check=True, ) @@ -96,12 +98,7 @@ def test_wan_pretrain_mock(self, tmp_path): # Basic verification that the run completed assert result.returncode == 0, f"Command failed with return code {result.returncode}" - # Check for common success indicators in output - assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), ( - "Expected to see iteration progress in output" - ) - except subprocess.TimeoutExpired: - pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds") + pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: - pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}") + pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}") From f240ccd896288bde0f543188b24e7247bae34e43 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 21:37:03 -0800 Subject: [PATCH 11/20] Reapply "Revert GHA changes" This reverts commit fdb911f729d2870e96266e34b7592819140ff2e7. Signed-off-by: Pablo Garay --- .github/workflows/cicd-main.yml | 76 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index fc8b2c25..962e3d6c 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -52,52 +52,52 @@ jobs: AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - # cicd-unit-tests: - # strategy: - # fail-fast: false - # matrix: - # include: - # - script: L0_Unit_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 - # - script: L0_Unit_Tests_CPU - # runner: linux-amd64-cpu16 - # cpu-only: true - # needs: [cicd-container-build] - # runs-on: ${{ matrix.runner }} - # name: ${{ matrix.script }} - # environment: nemo-ci - # steps: - # - name: Checkout - # uses: actions/checkout@v4 - # with: - # submodules: recursive - # - name: main - # uses: ./.github/actions/test-template - # with: - # runner: ${{ runner.name }} - # script: ${{ matrix.script }} - # timeout: ${{ matrix.timeout || 10 }} - # is_unit_test: "true" - # image: dfm - # cpu-only: ${{ matrix.cpu-only || false }} - # has-azure-credentials: "true" - # azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - # azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - # azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + cicd-unit-tests: + strategy: + fail-fast: false + matrix: + include: + - script: L0_Unit_Tests_GPU + runner: self-hosted-nemo + timeout: 30 + - script: L0_Unit_Tests_CPU + runner: linux-amd64-cpu16 + cpu-only: true + needs: [cicd-container-build] + runs-on: ${{ matrix.runner }} + name: ${{ matrix.script }} + environment: nemo-ci + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + - name: main + uses: ./.github/actions/test-template + with: + runner: ${{ runner.name }} + script: ${{ matrix.script }} + timeout: ${{ matrix.timeout || 10 }} + is_unit_test: "true" + image: dfm + cpu-only: ${{ matrix.cpu-only || false }} + has-azure-credentials: "true" + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} cicd-e2e-tests: strategy: fail-fast: false matrix: include: - # - script: L2_Functional_Tests_GPU - # runner: self-hosted-nemo - # timeout: 30 + - script: L2_Functional_Tests_GPU + runner: self-hosted-nemo + timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30 - needs: [cicd-container-build] + needs: [cicd-unit-tests] runs-on: ${{ matrix.runner }} name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }} environment: nemo-ci @@ -122,7 +122,7 @@ jobs: Nemo_CICD_Test: needs: - cicd-container-build - # - cicd-unit-tests + - cicd-unit-tests - cicd-e2e-tests if: always() runs-on: ubuntu-latest From 0964c625eff90ec2b637e59f090835731e924c76 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 21:43:54 -0800 Subject: [PATCH 12/20] update path per request Signed-off-by: Pablo Garay --- tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 2 +- .../recipes/test_wan_pretrain.py} | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) rename tests/functional_tests/{test_mcore_wan_pretrain.py => mcore/recipes/test_wan_pretrain.py} (99%) diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh index b8d237a1..7977af26 100644 --- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh +++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/mcore/recipes/test_wan_pretrain.py -m "not pleasefixme" --with_downloads -v diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py similarity index 99% rename from tests/functional_tests/test_mcore_wan_pretrain.py rename to tests/functional_tests/mcore/recipes/test_wan_pretrain.py index 0c9879d9..ca68bd1c 100644 --- a/tests/functional_tests/test_mcore_wan_pretrain.py +++ b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py @@ -102,3 +102,4 @@ def test_wan_pretrain_mock(self, tmp_path): pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}") + From d08b5af04c6a2e96f0281c3cc2dca3b2997cc6e8 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 21:47:31 -0800 Subject: [PATCH 13/20] lintfix Signed-off-by: Pablo Garay --- tests/functional_tests/mcore/recipes/test_wan_pretrain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/functional_tests/mcore/recipes/test_wan_pretrain.py b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py index ca68bd1c..0c9879d9 100644 --- a/tests/functional_tests/mcore/recipes/test_wan_pretrain.py +++ b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py @@ -102,4 +102,3 @@ def test_wan_pretrain_mock(self, tmp_path): pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)") except subprocess.CalledProcessError as e: pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}") - From eebe7318021e66119e3047919fc344fc10603415 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 15 Nov 2025 22:32:59 -0800 Subject: [PATCH 14/20] update CONTRIBUTING.md Signed-off-by: Pablo Garay --- CONTRIBUTING.md | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index aed9cf99..1b34c164 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,19 +5,16 @@ Use the instructions below to setup a dev environment and a dev container ### Building a container ```bash -# We recommend you to get the latest commits for Megatron-Bridge and Autmodel -# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands -git submodule update --init --recursive --remote # Get all the 3rd party submodules -cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong -# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty -git checkout -cd ../../../../ +# Initialize all submodules (Megatron-Bridge, Automodel, and nested Megatron-LM) +git submodule update --init --recursive + +# Build the container docker build -f docker/Dockerfile.ci -t dfm:latest . ``` ### Run the container ```bash -docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash +docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash ``` ### inside the container @@ -30,6 +27,8 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM ## Signing Your Work +### Sign-Off (Required) + * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license. * Any contribution which contains commits that are not Signed-Off will not be accepted. @@ -43,6 +42,32 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM Signed-off-by: Your Name ``` +### Commit Verification (Recommended) + +* We recommend signing your commits with SSH or GPG to get the "Verified" badge on GitHub. + +* **SSH Signing (Easiest):** + ```bash + # Configure SSH signing + git config --global gpg.format ssh + git config --global user.signingkey ~/.ssh/id_rsa.pub # or id_ed25519.pub + git config --global commit.gpgsign true + + # Add your SSH key as a "Signing Key" on GitHub: https://github.com/settings/keys + ``` + +* **GPG Signing (Alternative):** + ```bash + # Generate a GPG key + gpg --full-generate-key + + # Configure GPG signing + git config --global user.signingkey YOUR_GPG_KEY_ID + git config --global commit.gpgsign true + + # Add your GPG key to GitHub: https://github.com/settings/keys + ``` + * Full text of the DCO: ``` From 6685a540d9dd8b087d268c857be32208677ccb97 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sun, 16 Nov 2025 00:01:56 -0800 Subject: [PATCH 15/20] lintfix Signed-off-by: Pablo Garay --- CONTRIBUTING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1b34c164..bb807bd7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,7 +52,7 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM git config --global gpg.format ssh git config --global user.signingkey ~/.ssh/id_rsa.pub # or id_ed25519.pub git config --global commit.gpgsign true - + # Add your SSH key as a "Signing Key" on GitHub: https://github.com/settings/keys ``` @@ -60,11 +60,11 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM ```bash # Generate a GPG key gpg --full-generate-key - + # Configure GPG signing git config --global user.signingkey YOUR_GPG_KEY_ID git config --global commit.gpgsign true - + # Add your GPG key to GitHub: https://github.com/settings/keys ``` From a5a109a7a4712eb7b526a0ba28ed84ab661ab1e9 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Sun, 16 Nov 2025 11:31:08 -0800 Subject: [PATCH 16/20] adding v run --group megatron-bridge --- tests/unit_tests/L0_Unit_Tests_CPU.sh | 2 +- tests/unit_tests/L0_Unit_Tests_GPU.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/L0_Unit_Tests_CPU.sh b/tests/unit_tests/L0_Unit_Tests_CPU.sh index 081c1564..dd8c1ee4 100644 --- a/tests/unit_tests/L0_Unit_Tests_CPU.sh +++ b/tests/unit_tests/L0_Unit_Tests_CPU.sh @@ -14,4 +14,4 @@ # Hide GPU from PyTorch by setting CUDA_VISIBLE_DEVICES to empty # This makes torch.cuda.is_available() return False -CUDA_VISIBLE_DEVICES="" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads +CUDA_VISIBLE_DEVICES="" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads diff --git a/tests/unit_tests/L0_Unit_Tests_GPU.sh b/tests/unit_tests/L0_Unit_Tests_GPU.sh index ae77eb3f..0468cab6 100644 --- a/tests/unit_tests/L0_Unit_Tests_GPU.sh +++ b/tests/unit_tests/L0_Unit_Tests_GPU.sh @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads +CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads From f43aea30b1e2c84f56b8db162455a78bd2212108 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Sun, 16 Nov 2025 11:46:42 -0800 Subject: [PATCH 17/20] update test --- .../megatron/model/wan/test_wan_provider.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py index bd451ff7..30d65e05 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py @@ -13,10 +13,12 @@ # limitations under the License. import torch +import torch.nn as nn from megatron.core import parallel_state from dfm.src.megatron.model.wan.wan_model import WanModel from dfm.src.megatron.model.wan.wan_provider import WanModelProvider +import dfm.src.megatron.model.wan.wan_model as wan_model_module def test_wan_model_provider_provide_returns_model(monkeypatch): @@ -26,6 +28,21 @@ def test_wan_model_provider_provide_returns_model(monkeypatch): # Avoid querying uninitialized PP groups monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False) + # Bypass Megatron's ProcessGroupCollection usage inside TransformerBlock during construction. + # CI does not initialize distributed groups; a dummy block suffices for construction checks. + class DummyTransformerBlock(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.input_tensor = None + + def set_input_tensor(self, input_tensor): + self.input_tensor = input_tensor + + def forward(self, hidden_states, **kwargs): + return hidden_states + + monkeypatch.setattr(wan_model_module, "TransformerBlock", DummyTransformerBlock, raising=False) + provider = WanModelProvider( num_layers=2, # keep small hidden_size=64, From d2d983f91b26ae8b8457f91f19388d38045ef071 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Sun, 16 Nov 2025 11:51:52 -0800 Subject: [PATCH 18/20] ruff lint --- tests/unit_tests/megatron/model/wan/test_wan_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py index 30d65e05..78541900 100644 --- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py +++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py @@ -16,9 +16,9 @@ import torch.nn as nn from megatron.core import parallel_state +import dfm.src.megatron.model.wan.wan_model as wan_model_module from dfm.src.megatron.model.wan.wan_model import WanModel from dfm.src.megatron.model.wan.wan_provider import WanModelProvider -import dfm.src.megatron.model.wan.wan_model as wan_model_module def test_wan_model_provider_provide_returns_model(monkeypatch): From 166e8097604c6fc7669091ace7c640d2319962bd Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Mon, 17 Nov 2025 06:40:15 -0800 Subject: [PATCH 19/20] restore Dockerfile.ci --- docker/Dockerfile.ci | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index fadafa44..cfcc3a74 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -39,7 +39,8 @@ COPY 3rdparty/Automodel ./3rdparty/Automodel # Copy minimal Megatron-Bridge files for metadata (prevents full source build) COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/ -COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src +COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/__init__.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/ +COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/package_info.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/ # Copy minimal Megatron-LM files for metadata (prevents full source build) COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/ From c1bde61dff413cc6f691adf0601c54db86d5b154 Mon Sep 17 00:00:00 2001 From: Huy Vu2 Date: Mon, 17 Nov 2025 08:22:04 -0800 Subject: [PATCH 20/20] update .github/workflows/cicd-main.yml --- .github/workflows/cicd-main.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 962e3d6c..2d1a28b5 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -91,9 +91,6 @@ jobs: fail-fast: false matrix: include: - - script: L2_Functional_Tests_GPU - runner: self-hosted-nemo - timeout: 30 - script: L2_Mcore_Mock_Tests_GPU runner: self-hosted-nemo timeout: 30