From dae57f9d835c954aabd8c5240805c94edf5ae107 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Fri, 14 Nov 2025 08:00:27 -0800
Subject: [PATCH 01/20] adding tests

---
 .../flow_matching/flow_inference_pipeline.py  |   8 +-
 .../data/wan/test_wan_energon_datamodule.py   |  57 ++++++++
 .../data/wan/test_wan_mock_datamodule.py      |  54 +++++++
 .../megatron/data/wan/test_wan_taskencoder.py | 135 ++++++++++++++++++
 .../test_flow_inference_pipeline.py           |  71 +++++++++
 .../wan/flow_matching/test_flow_pipeline.py   | 104 ++++++++++++++
 .../flow_matching/test_time_shift_utils.py    |  54 +++++++
 .../wan/inference/test_inference_init.py      |  29 ++++
 .../wan/inference/test_inference_utils.py     |  65 +++++++++
 .../megatron/model/wan/test_rope_utils.py     |  38 +++++
 .../megatron/model/wan/test_utils.py          |  36 +++++
 .../megatron/model/wan/test_wan_layer_spec.py |  14 ++
 .../megatron/model/wan/test_wan_model_misc.py |  13 ++
 .../megatron/model/wan/test_wan_provider.py   |  55 +++++++
 .../megatron/model/wan/test_wan_step.py       |  50 +++++++
 15 files changed, 779 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
 create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
 create mode 100644 tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
 create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
 create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
 create mode 100644 tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
 create mode 100644 tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
 create mode 100644 tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_rope_utils.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_utils.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_provider.py
 create mode 100644 tests/unit_tests/megatron/model/wan/test_wan_step.py

diff --git a/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py b/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py
index 2bbb0eb3..459876b2 100644
--- a/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py
+++ b/dfm/src/megatron/model/wan/flow_matching/flow_inference_pipeline.py
@@ -229,13 +229,13 @@ def forward_pp_step(
         """
 
         pp_world_size = parallel_state.get_pipeline_model_parallel_world_size()
-        is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True)
-        is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True)
-
-        # PP=1: no pipeline parallelism
+        # PP=1: no pipeline parallelism (avoid touching PP groups which may be uninitialized in unit tests)
         if pp_world_size == 1:
             noise_pred_pp = self.model(latent_model_input, grid_sizes=grid_sizes, t=timestep, **arg_c)
             return noise_pred_pp
+        # For PP>1, safe to query stage information
+        is_pp_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True)
+        is_pp_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True)
 
         # PP>1: pipeline parallelism
         hidden_size = self.model.config.hidden_size
diff --git a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
new file mode 100644
index 00000000..6fb27798
--- /dev/null
+++ b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
@@ -0,0 +1,57 @@
+import types
+
+from dfm.src.megatron.data.wan import wan_energon_datamodule as wan_dm_mod
+from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder
+
+
+class _FakeDiffusionDataModule:
+    def __init__(
+        self,
+        *,
+        path: str,
+        seq_length: int,
+        packing_buffer_size: int,
+        task_encoder,
+        micro_batch_size: int,
+        global_batch_size: int,
+        num_workers: int,
+    ):
+        self.path = path
+        self.seq_length = seq_length
+        self.packing_buffer_size = packing_buffer_size
+        self.task_encoder = task_encoder
+        self.micro_batch_size = micro_batch_size
+        self.global_batch_size = global_batch_size
+        self.num_workers = num_workers
+
+    # mimic API used by WanDataModuleConfig.build_datasets
+    def train_dataloader(self):
+        return "train"
+
+
+def test_wan_datamodule_config_initialization(monkeypatch):
+    # Patch the symbol used inside wan_energon_datamodule module
+    monkeypatch.setattr(wan_dm_mod, "DiffusionDataModule", _FakeDiffusionDataModule)
+
+    cfg = wan_dm_mod.WanDataModuleConfig(
+        path="",
+        seq_length=128,
+        task_encoder_seq_length=128,
+        packing_buffer_size=4,
+        micro_batch_size=2,
+        global_batch_size=8,
+        num_workers=0,
+    )
+
+    # __post_init__ should construct a dataset with WanTaskEncoder and propagate seq_length
+    assert isinstance(cfg.dataset, _FakeDiffusionDataModule)
+    assert cfg.sequence_length == cfg.dataset.seq_length == 128
+    assert isinstance(cfg.dataset.task_encoder, WanTaskEncoder)
+    assert cfg.dataset.task_encoder.seq_length == 128
+    assert cfg.dataset.task_encoder.packing_buffer_size == 4
+
+    # build_datasets should return train loader thrice
+    train, val, test = cfg.build_datasets(context=None)
+    assert train == "train" and val == "train" and test == "train"
+
+
diff --git a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
new file mode 100644
index 00000000..d8f10d74
--- /dev/null
+++ b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
@@ -0,0 +1,54 @@
+import torch
+
+from torch.utils.data import DataLoader
+
+from dfm.src.megatron.data.wan.wan_mock_datamodule import WanMockDataModuleConfig
+
+
+def test_wan_mock_datamodule_build_and_batch_shapes():
+    cfg = WanMockDataModuleConfig(
+        path="",
+        seq_length=128,
+        packing_buffer_size=2,
+        micro_batch_size=2,
+        global_batch_size=8,
+        num_workers=0,
+        # Use small shapes for a light-weight test run
+        F_latents=4,
+        H_latents=8,
+        W_latents=6,
+        patch_spatial=2,
+        patch_temporal=1,
+        number_packed_samples=2,
+        context_seq_len=16,
+        context_embeddings_dim=64,
+    )
+    train_dl, val_dl, test_dl = cfg.build_datasets(_context=None)
+    assert isinstance(train_dl, DataLoader)
+    assert train_dl is val_dl and val_dl is test_dl
+
+    batch = next(iter(train_dl))
+    expected_keys = {
+        "video_latents",
+        "context_embeddings",
+        "loss_mask",
+        "seq_len_q",
+        "seq_len_q_padded",
+        "seq_len_kv",
+        "seq_len_kv_padded",
+        "grid_sizes",
+        "video_metadata",
+    }
+    assert expected_keys.issubset(set(batch.keys()))
+
+    # Basic sanity checks on shapes/dtypes
+    assert batch["video_latents"].dim() == 3 and batch["video_latents"].shape[1] == 1
+    assert batch["context_embeddings"].dim() == 3 and batch["context_embeddings"].shape[1] == 1
+    assert batch["loss_mask"].dim() == 2 and batch["loss_mask"].shape[1] == 1
+    assert batch["seq_len_q"].dtype == torch.int32
+    assert batch["seq_len_q_padded"].dtype == torch.int32
+    assert batch["seq_len_kv"].dtype == torch.int32
+    assert batch["seq_len_kv_padded"].dtype == torch.int32
+    assert batch["grid_sizes"].dtype == torch.int32
+
+
diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
new file mode 100644
index 00000000..2de5173d
--- /dev/null
+++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder, cook, parallel_state
+
+
+def test_cook_extracts_expected_fields():
+    sample = {
+        "__key__": "k",
+        "__restore_key__": "rk",
+        "__subflavors__": [],
+        "json": {"meta": 1},
+        "pth": torch.randn(1, 2, 2, 2),
+        "pickle": torch.randn(3, 4),
+        "unused": 123,
+    }
+    out = cook(sample)
+    assert "json" in out and out["json"] is sample["json"]
+    assert "pth" in out and torch.equal(out["pth"], sample["pth"])
+    assert "pickle" in out and torch.equal(out["pickle"], sample["pickle"])
+    # ensure basic keys from the sample are preserved by cook via basic_sample_keys()
+    assert out["__key__"] == sample["__key__"]
+    assert out["__restore_key__"] == sample["__restore_key__"]
+    assert out["__subflavors__"] == sample["__subflavors__"]
+
+
+def test_encode_sample_no_context_parallel(monkeypatch):
+    # Ensure CP world size is 1 to avoid extra padding branch
+    monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
+    # Ensure seeded wrapper has an active worker config
+    from megatron.energon.task_encoder.base import WorkerConfig
+    class _FakeWorkerCfg:
+        def worker_seed(self): return 123
+        active_worker_sample_index = 0
+    monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False)
+
+    # Construct a minimal, consistent sample
+    c = 8
+    F_latents, H_latents, W_latents = 4, 8, 6
+    patch_temporal, patch_spatial = 1, 2
+    # video latent before patchify has shape [c, F_latents, H_latents, W_latents]
+    # where grid sizes (patch counts) are (F_latents // pF, H_latents // pH, W_latents // pW)
+    video_latent = torch.randn(c, F_latents, H_latents, W_latents)
+    context_len, context_dim = 256, 64
+    context_embeddings = torch.randn(context_len, context_dim)
+    sample = {
+        "__key__": "k",
+        "__restore_key__": "rk",
+        "__subflavors__": [],
+        "json": {"meta": 1},
+        "pth": video_latent,
+        "pickle": context_embeddings,
+    }
+
+    enc = WanTaskEncoder(seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None)
+    out = enc.encode_sample(sample)
+
+    # Grid / patches
+    F_patches = F_latents // patch_temporal
+    H_patches = H_latents // patch_spatial
+    W_patches = W_latents // patch_spatial
+    num_patches = F_patches * H_patches * W_patches
+    patch_vec_dim = c * patch_temporal * patch_spatial * patch_spatial
+
+    assert out.video.shape == (num_patches, patch_vec_dim)
+    assert out.latent_shape.dtype == torch.int32
+    assert torch.equal(out.latent_shape, torch.tensor([F_patches, H_patches, W_patches], dtype=torch.int32))
+
+    # Loss mask and seq lengths
+    assert out.loss_mask.dtype == torch.bfloat16
+    assert out.loss_mask.shape[0] == num_patches
+    assert torch.equal(out.seq_len_q, torch.tensor([num_patches], dtype=torch.int32))
+    # context embeddings are padded to fixed 512 inside encode_sample
+    assert torch.equal(out.seq_len_kv, torch.tensor([512], dtype=torch.int32))
+    assert torch.equal(out.seq_len_q_padded, out.seq_len_q)
+    assert torch.equal(out.seq_len_kv_padded, out.seq_len_kv)
+
+    # Metadata passthrough
+    assert out.video_metadata == sample["json"]
+    assert out.__key__ == sample["__key__"]
+    assert out.__restore_key__ == sample["__restore_key__"]
+    assert out.__subflavors__ == sample["__subflavors__"]
+
+
+def test_batch_with_packing_buffer_size(monkeypatch):
+    # Force CP world size 1
+    monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
+    # Ensure seeded wrapper has an active worker config
+    from megatron.energon.task_encoder.base import WorkerConfig
+    class _FakeWorkerCfg:
+        def worker_seed(self): return 456
+        active_worker_sample_index = 0
+    monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False)
+
+    c = 4
+    F_latents, H_latents, W_latents = 2, 4, 4
+    patch_temporal, patch_spatial = 1, 2
+    video_latent = torch.randn(c, F_latents * patch_temporal, H_latents * patch_spatial, W_latents * patch_spatial)
+    sample = {
+        "__key__": "k",
+        "__restore_key__": "rk",
+        "__subflavors__": [],
+        "json": {"meta": 1},
+        "pth": video_latent,
+        "pickle": torch.randn(32, 128),
+    }
+
+    enc = WanTaskEncoder(seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3)
+    diff_sample = enc.encode_sample(sample)
+    batch = enc.batch([diff_sample])
+
+    assert isinstance(batch, dict)
+    for k in ["video_latents", "context_embeddings", "loss_mask", "seq_len_q", "seq_len_q_padded", "seq_len_kv", "seq_len_kv_padded", "grid_sizes", "video_metadata"]:
+        assert k in batch
+
+    # video_latents: [S, 1, ...], where S equals sample.video length when CP world size is 1
+    assert batch["video_latents"].shape[1] == 1
+    assert batch["context_embeddings"].shape[1] == 1
+    assert batch["loss_mask"].shape[1] == 1
+
+
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
new file mode 100644
index 00000000..4110c8e3
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
@@ -0,0 +1,71 @@
+import os
+
+import pytest
+import torch
+
+from dfm.src.megatron.model.wan.flow_matching.flow_inference_pipeline import FlowInferencePipeline
+
+
+def test_select_checkpoint_dir_latest(tmp_path):
+    base = tmp_path / "ckpts"
+    os.makedirs(base / "iter_0000100")
+    os.makedirs(base / "iter_0000200")
+
+    # Minimal inference config object
+    class _Cfg:
+        num_train_timesteps = 1000
+        param_dtype = torch.float32
+        text_len = 512
+        t5_dtype = torch.float32
+        vae_stride = (1, 1, 1)
+        patch_size = (1, 1, 1)
+
+    # Instantiate object without running heavy init by patching __init__ to a no-op
+    pip = object.__new__(FlowInferencePipeline)
+    pip.inference_cfg = _Cfg()
+
+    latest = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=None)
+    assert latest.endswith("iter_0000200")
+
+    specific = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=100)
+    assert specific.endswith("iter_0000100")
+
+    with pytest.raises(FileNotFoundError):
+        FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=999)
+
+
+def test_forward_pp_step_no_pp(monkeypatch):
+    # Build a minimal instance skipping heavy init
+    pip = object.__new__(FlowInferencePipeline)
+    class _Model:
+        class _Cfg:
+            hidden_size = 16
+            qkv_format = "sbhd"
+        config = _Cfg()
+        def __call__(self, x, grid_sizes, t, **kwargs):
+            return x  # echo input
+        def set_input_tensor(self, x):
+            pass
+    pip.model = _Model()
+
+    # Patch parallel state to no-PP path
+    from megatron.core import parallel_state
+    monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False)
+
+    S, B, H = 8, 1, pip.model.config.hidden_size
+    latent_model_input = torch.randn(S, B, H, dtype=torch.float32)
+    grid_sizes = [(2, 2, 2)]
+    timestep = torch.tensor([10.0], dtype=torch.float32)
+    arg_c = {}
+
+    out = FlowInferencePipeline.forward_pp_step(
+        pip,
+        latent_model_input=latent_model_input,
+        grid_sizes=grid_sizes,
+        max_video_seq_len=S,
+        timestep=timestep,
+        arg_c=arg_c,
+    )
+    assert out.shape == latent_model_input.shape
+
+
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
new file mode 100644
index 00000000..1fcc3ba4
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
@@ -0,0 +1,104 @@
+import types
+import torch
+
+from dfm.src.megatron.model.wan.flow_matching import flow_pipeline as flow_pipeline_mod
+from dfm.src.megatron.model.wan.flow_matching.flow_pipeline import FlowPipeline
+
+
+class _DummyModel:
+    class _Cfg:
+        in_channels = 2
+        patch_spatial = 1
+        patch_temporal = 1
+
+    def __init__(self):
+        self.config = self._Cfg()
+
+    def __call__(self, x, grid_sizes, t, context, packed_seq_params):
+        # Return zeros matching input shape (seq_len, 1, latent_dim)
+        return torch.zeros_like(x)
+
+
+def test_flow_pipeline_training_step_cpu_stub(monkeypatch):
+    # Bypass heavy diffusers init
+    def _stub_init(self, model_id="x", seed=0):
+        self.pipe = types.SimpleNamespace(scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000)))
+    monkeypatch.setattr(FlowPipeline, "__init__", _stub_init)
+
+    # Make patchify accept both tensor and list for this test
+    def _safe_patchify(x, patch_size):
+        # Always delegate to the real implementation in utils to avoid recursion
+        from dfm.src.megatron.model.wan import utils as wan_utils
+        impl = wan_utils.patchify
+        # Normalize inputs to expected 4D [C, F, H, W] without batch dim
+        if isinstance(x, list):
+            x_norm = []
+            for t in x:
+                if isinstance(t, torch.Tensor) and t.dim() == 5 and t.size(0) == 1:
+                    x_norm.append(t.squeeze(0))
+                else:
+                    x_norm.append(t)
+        else:
+            t = x
+            if isinstance(t, torch.Tensor) and t.dim() == 5 and t.size(0) == 1:
+                t = t.squeeze(0)
+            x_norm = [t]
+        return impl(x_norm, patch_size)
+    monkeypatch.setattr(flow_pipeline_mod, "patchify", _safe_patchify)
+
+    # Disable context parallelism and force last pipeline stage
+    from megatron.core import parallel_state
+    monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
+    monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False)
+
+    pipe = FlowPipeline()
+    model = _DummyModel()
+
+    # Build a minimal, consistent batch: seq_len = F*H*W = 2*2*2 = 8, latent_dim = in_channels * pF * pH * pW = 2
+    F, H, W = 2, 2, 2
+    seq_len = F * H * W
+    latent_dim = model.config.in_channels
+
+    video_latents = torch.randn(seq_len, 1, latent_dim, dtype=torch.float32)
+    context_embeddings = torch.randn(4, 1, 8, dtype=torch.float32)
+    loss_mask = torch.ones(seq_len, dtype=torch.bfloat16)
+    grid_sizes = torch.tensor([[F, H, W]], dtype=torch.int32)
+
+    # Packed seq params with simple cumulative lengths
+    from megatron.core.packed_seq_params import PackedSeqParams
+    cu = torch.tensor([0, seq_len], dtype=torch.int32)
+    packed_seq_params = {
+        "self_attention": PackedSeqParams(
+            cu_seqlens_q=cu, cu_seqlens_q_padded=cu, cu_seqlens_kv=cu, cu_seqlens_kv_padded=cu, qkv_format="sbhd"
+        ),
+        "cross_attention": PackedSeqParams(
+            cu_seqlens_q=cu, cu_seqlens_q_padded=cu, cu_seqlens_kv=cu, cu_seqlens_kv_padded=cu, qkv_format="sbhd"
+        ),
+    }
+
+    batch = {
+        "video_latents": video_latents,
+        "context_embeddings": context_embeddings,
+        "loss_mask": loss_mask,
+        "grid_sizes": grid_sizes,
+        "packed_seq_params": packed_seq_params,
+        "video_metadata": {},
+    }
+
+    model_pred, weighted_loss, split_loss_mask = pipe.training_step(
+        model,
+        batch,
+        use_sigma_noise=True,
+        timestep_sampling="uniform",
+        flow_shift=3.0,
+        mix_uniform_ratio=1.0,  # force uniform branch
+        sigma_min=0.0,
+        sigma_max=1.0,
+    )
+
+    # Basic shape checks
+    assert model_pred.shape == video_latents.shape
+    assert weighted_loss.shape[:2] == video_latents.shape[:2]
+    assert split_loss_mask.shape == loss_mask.shape
+
+
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
new file mode 100644
index 00000000..977fb55c
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
@@ -0,0 +1,54 @@
+import torch
+
+from dfm.src.megatron.model.wan.flow_matching.time_shift_utils import (
+    time_shift,
+    compute_density_for_timestep_sampling,
+    get_flow_match_loss_weight,
+)
+
+
+def test_time_shift_constant_linear_sqrt_bounds_and_monotonic():
+    t_small = torch.tensor(0.1, dtype=torch.float32)
+    t_large = torch.tensor(0.9, dtype=torch.float32)
+    seq_len = 512
+
+    # constant
+    s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="constant", constant=3.0)
+    s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="constant", constant=3.0)
+    assert 0.0 <= s_small.item() <= 1.0
+    assert 0.0 <= s_large.item() <= 1.0
+    assert s_large > s_small
+
+    # linear
+    s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="linear", base_shift=0.5, max_shift=1.15)
+    s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="linear", base_shift=0.5, max_shift=1.15)
+    assert 0.0 <= s_small.item() <= 1.0
+    assert 0.0 <= s_large.item() <= 1.0
+    assert s_large > s_small
+
+    # sqrt
+    s_small = time_shift(t_small, image_seq_len=seq_len, shift_type="sqrt")
+    s_large = time_shift(t_large, image_seq_len=seq_len, shift_type="sqrt")
+    assert 0.0 <= s_small.item() <= 1.0
+    assert 0.0 <= s_large.item() <= 1.0
+    assert s_large > s_small
+
+
+def test_compute_density_for_timestep_sampling_modes_and_ranges():
+    batch_size = 16
+    for mode in ["uniform", "logit_normal", "mode"]:
+        u = compute_density_for_timestep_sampling(mode, batch_size=batch_size, logit_mean=0.0, logit_std=1.0)
+        assert u.shape == (batch_size,)
+        assert torch.all((0.0 <= u) & (u <= 1.0))
+
+
+def test_get_flow_match_loss_weight_simple_cases():
+    sigma = torch.zeros(5, dtype=torch.float32)
+    w = get_flow_match_loss_weight(sigma, shift=3.0)
+    assert torch.allclose(w, torch.ones_like(w))
+
+    sigma = torch.ones(5, dtype=torch.float32)
+    w = get_flow_match_loss_weight(sigma, shift=2.0)
+    assert torch.allclose(w, torch.full_like(sigma, 3.0))
+
+
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
new file mode 100644
index 00000000..1a0eae39
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
@@ -0,0 +1,29 @@
+from dfm.src.megatron.model.wan.inference import SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
+
+
+def test_size_configs_structure_and_values():
+    assert isinstance(SIZE_CONFIGS, dict)
+    for key, val in SIZE_CONFIGS.items():
+        assert isinstance(key, str)
+        assert isinstance(val, tuple) and len(val) == 2
+        w, h = val
+        assert isinstance(w, int) and isinstance(h, int)
+        assert w > 0 and h > 0
+
+
+def test_max_area_configs_consistency():
+    for size_key, area in MAX_AREA_CONFIGS.items():
+        w, h = SIZE_CONFIGS[size_key]
+        assert area == w * h
+
+
+def test_supported_sizes_lists():
+    assert "t2v-14B" in SUPPORTED_SIZES
+    assert "t2v-1.3B" in SUPPORTED_SIZES
+    for model_key, sizes in SUPPORTED_SIZES.items():
+        assert isinstance(sizes, tuple)
+        for s in sizes:
+            assert s in SIZE_CONFIGS
+
+
+
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
new file mode 100644
index 00000000..b0389bbd
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
@@ -0,0 +1,65 @@
+import os
+import tempfile
+
+import argparse
+import torch
+
+from dfm.src.megatron.model.wan.inference import utils as inf_utils
+
+
+def test_str2bool_variants_and_errors():
+    true_vals = ["yes", "true", "t", "y", "1", "TRUE", "Yes"]
+    false_vals = ["no", "false", "f", "n", "0", "FALSE", "No"]
+    for v in true_vals:
+        assert inf_utils.str2bool(v) is True
+    for v in false_vals:
+        assert inf_utils.str2bool(v) is False
+    assert inf_utils.str2bool(True) is True
+    assert inf_utils.str2bool(False) is False
+    try:
+        inf_utils.str2bool("maybe")
+    except argparse.ArgumentTypeError:
+        pass
+    else:
+        assert False, "Expected argparse.ArgumentTypeError for invalid boolean string"
+
+
+def test_cache_image_writes_file(tmp_path):
+    # Small 3x8x8 image
+    img = torch.rand(3, 8, 8)
+    out_path = tmp_path / "test.png"
+    saved = inf_utils.cache_image(img, str(out_path), nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1)
+    assert saved == str(out_path)
+    assert os.path.exists(out_path)
+    assert os.path.getsize(out_path) > 0
+
+
+def test_cache_video_uses_writer_and_returns_path(monkeypatch):
+    # Stub imageio.get_writer to avoid codec dependency
+    calls = {"frames": 0, "path": None}
+
+    class _DummyWriter:
+        def __init__(self, path, fps=None, codec=None, quality=None):
+            calls["path"] = path
+        def append_data(self, frame):
+            calls["frames"] += 1
+        def close(self):
+            pass
+
+    monkeypatch.setattr(inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality))
+
+    # Stub make_grid to return a fixed CHW tensor regardless of input
+    def _fake_make_grid(x, nrow, normalize, value_range):
+        return torch.rand(3, 4, 5)
+    monkeypatch.setattr(inf_utils.torchvision.utils, "make_grid", _fake_make_grid)
+
+    # Build a tensor whose unbind(2) yields 2 slices so we expect 2 frames written
+    vid = torch.rand(3, 3, 2, 2)  # shape chosen to exercise unbind(2)
+    with tempfile.TemporaryDirectory() as td:
+        out_file = os.path.join(td, "out.mp4")
+        result = inf_utils.cache_video(vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1)
+        assert result == out_file
+        assert calls["path"] == out_file
+        assert calls["frames"] == vid.shape[2]  # frames equal to number of unbinds on dim=2
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
new file mode 100644
index 00000000..be7935b8
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
@@ -0,0 +1,38 @@
+import pytest
+import torch
+
+from dfm.src.megatron.model.wan.rope_utils import Wan3DRopeEmbeddings
+
+
+def test_wan3d_rope_embeddings_shapes_and_padding():
+    # Small, CPU-friendly config
+    n_head = 2
+    dim_head = 8  # must be divisible with the internal splits
+    max_position_len = 16
+    rope = Wan3DRopeEmbeddings(dim_head=dim_head, max_position_len=max_position_len)
+
+    # Two samples with different (f, h, w)
+    grid_sizes = torch.tensor([[2, 3, 2], [4, 1, 1]], dtype=torch.int32)
+    seq_lens = [(2 * 3 * 2), (4 * 1 * 1)]
+    padded_lens = [seq_lens[0] + 2, seq_lens[1]]  # pad first sample
+
+    cu_seqlens_q_padded = torch.tensor([0, padded_lens[0], padded_lens[0] + padded_lens[1]], dtype=torch.int32)
+
+    out = rope(
+        n_head=n_head,
+        dim_head=dim_head,
+        cu_seqlens_q_padded=cu_seqlens_q_padded,
+        grid_sizes=grid_sizes,
+        device=torch.device("cpu"),
+    )
+
+    # Total concatenated length equals sum of padded lens
+    assert out.shape == (sum(padded_lens), 1, 1, dim_head)
+
+    # Check that padding region for the first sample is zero
+    first_seq_len = seq_lens[0]
+    first_padded_len = padded_lens[0]
+    tail = out[first_seq_len:first_padded_len]
+    assert torch.all(tail == 0), "Padded region should be zeros"
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_utils.py b/tests/unit_tests/megatron/model/wan/test_utils.py
new file mode 100644
index 00000000..6ee48c75
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_utils.py
@@ -0,0 +1,36 @@
+import torch
+
+from dfm.src.megatron.model.wan.utils import grid_sizes_calculation, patchify, unpatchify
+
+
+def test_grid_sizes_calculation_basic():
+    input_shape = (4, 8, 6)
+    patch_size = (1, 2, 3)
+    f, h, w = grid_sizes_calculation(input_shape, patch_size)
+    assert (f, h, w) == (4, 4, 2)
+
+
+def test_patchify_unpatchify_roundtrip():
+    # Video latent: [c, F_patches * pF, H_patches * pH, W_patches * pW]
+    c = 3
+    F_patches, H_patches, W_patches = 2, 2, 3
+    patch_size = (1, 2, 2)
+    F_latents = F_patches * patch_size[0]
+    H_latents = H_patches * patch_size[1]
+    W_latents = W_patches * patch_size[2]
+
+    x = [torch.randn(c, F_latents, H_latents, W_latents)]
+
+    patches = patchify(x, patch_size)
+    assert isinstance(patches, list) and len(patches) == 1
+    seq_len, dim = patches[0].shape
+    assert seq_len == F_patches * H_patches * W_patches
+    assert dim == c * (patch_size[0] * patch_size[1] * patch_size[2])
+
+    # Unpatchify and compare
+    y = unpatchify(patches, [[F_patches, H_patches, W_patches]], out_dim=c, patch_size=patch_size)
+    assert isinstance(y, list) and len(y) == 1
+    assert y[0].shape == x[0].shape
+    torch.testing.assert_close(y[0], x[0], rtol=1e-5, atol=1e-5)
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
new file mode 100644
index 00000000..de327a56
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
@@ -0,0 +1,14 @@
+from dfm.src.megatron.model.wan.wan_layer_spec import get_wan_block_with_transformer_engine_spec
+
+
+def test_get_wan_block_with_transformer_engine_spec_basic():
+    spec = get_wan_block_with_transformer_engine_spec()
+    # Basic structure checks
+    assert hasattr(spec, "module")
+    assert hasattr(spec, "submodules")
+    sub = spec.submodules
+    # Expected submodule fields exist
+    for name in ["norm1", "norm2", "norm3", "full_self_attention", "cross_attention", "mlp"]:
+        assert hasattr(sub, name), f"Missing submodule {name}"
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
new file mode 100644
index 00000000..09b65729
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
@@ -0,0 +1,13 @@
+import torch
+
+from dfm.src.megatron.model.wan.wan_model import sinusoidal_embedding_1d
+
+
+def test_sinusoidal_embedding_1d_shape_and_dtype():
+    dim = 16
+    pos = torch.arange(10, dtype=torch.float32)
+    emb = sinusoidal_embedding_1d(dim, pos)
+    assert emb.shape == (pos.shape[0], dim)
+    assert emb.dtype == torch.float32
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
new file mode 100644
index 00000000..ea337aa7
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
@@ -0,0 +1,55 @@
+import torch
+
+from dfm.src.megatron.model.wan.wan_provider import WanModelProvider
+from dfm.src.megatron.model.wan.wan_model import WanModel
+from megatron.core import parallel_state
+
+
+def test_wan_model_provider_provide_returns_model(monkeypatch):
+    # Force pipeline stage booleans to avoid dependency on initialized model parallel
+    monkeypatch.setattr(parallel_state, "is_pipeline_first_stage", lambda: True, raising=False)
+    monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False)
+    # Avoid querying uninitialized PP groups
+    monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False)
+
+    provider = WanModelProvider(
+        num_layers=2,  # keep small
+        hidden_size=64,
+        ffn_hidden_size=128,
+        num_attention_heads=4,
+        layernorm_epsilon=1e-6,
+        normalization="RMSNorm",
+        layernorm_zero_centered_gamma=False,
+        layernorm_across_heads=True,
+        add_qkv_bias=True,
+        rotary_interleaved=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        fp16_lm_cross_entropy=False,
+        parallel_output=True,
+        bf16=False,
+        params_dtype=torch.float32,
+        qkv_format="sbhd",
+        seq_length=128,
+        share_embeddings_and_output_weights=False,
+        vocab_size=32000,
+        make_vocab_size_divisible_by=128,
+        in_channels=4,
+        out_channels=4,
+        patch_spatial=2,
+        patch_temporal=1,
+        freq_dim=16,
+        text_len=32,
+        text_dim=64,
+    )
+    # Ensure config supplies fields expected by core attention
+    provider.kv_channels = provider.hidden_size // provider.num_attention_heads
+    provider.num_query_groups = provider.num_attention_heads
+    model = provider.provide()
+    assert isinstance(model, WanModel)
+    # Sanity check key config properties were plumbed
+    assert model.config.hidden_size == 64
+    assert model.config.num_attention_heads == 4
+    assert model.config.text_dim == 64
+
+
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_step.py b/tests/unit_tests/megatron/model/wan/test_wan_step.py
new file mode 100644
index 00000000..5c04f3db
--- /dev/null
+++ b/tests/unit_tests/megatron/model/wan/test_wan_step.py
@@ -0,0 +1,50 @@
+import pytest
+import torch
+
+from dfm.src.megatron.model.wan.wan_step import wan_data_step, WanForwardStep
+
+
+class _DummyIter:
+    def __init__(self, batch):
+        # mimic attribute used inside wan_data_step
+        self.iterable = [batch]
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="wan_data_step moves tensors to CUDA")
+def test_wan_data_step_builds_packed_seq_params_cuda_guarded():
+    # Construct minimal batch with required seq_len fields
+    batch = {
+        "seq_len_q": torch.tensor([3, 5], dtype=torch.int32),
+        "seq_len_q_padded": torch.tensor([4, 6], dtype=torch.int32),
+        "seq_len_kv": torch.tensor([2, 7], dtype=torch.int32),
+        "seq_len_kv_padded": torch.tensor([2, 8], dtype=torch.int32),
+        # include a tensor field to exercise device transfer
+        "video_latents": torch.randn(8, 1, 4, dtype=torch.float32),
+    }
+    it = _DummyIter(batch)
+    qkv_format = "sbhd"
+    out = wan_data_step(qkv_format, it)
+
+    assert "packed_seq_params" in out
+    for k in ["self_attention", "cross_attention"]:
+        assert k in out["packed_seq_params"]
+        p = out["packed_seq_params"][k]
+        assert hasattr(p, "cu_seqlens_q")
+        assert hasattr(p, "cu_seqlens_q_padded")
+        assert hasattr(p, "cu_seqlens_kv")
+        assert hasattr(p, "cu_seqlens_kv_padded")
+    # spot-check CUDA device after move
+    assert out["video_latents"].is_cuda
+
+
+def test_wan_forward_step_loss_partial_creation():
+    step = WanForwardStep()
+    mask = torch.ones(4, dtype=torch.float32)
+    loss_fn = step._create_loss_function(mask, check_for_nan_in_loss=False, check_for_spiky_loss=False)
+    # Just validate it's callable and is a functools.partial
+    import functools
+
+    assert isinstance(loss_fn, functools.partial)
+    assert callable(loss_fn)
+
+

From 93f94bcfc0d692b6433f9f544e5287f86f14e65d Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Fri, 14 Nov 2025 08:22:51 -0800
Subject: [PATCH 02/20] ruff lint

---
 .../data/wan/test_wan_energon_datamodule.py   | 16 ++++++++++++---
 .../data/wan/test_wan_mock_datamodule.py      | 17 +++++++++++++---
 .../megatron/data/wan/test_wan_taskencoder.py |  7 +++++--
 .../test_flow_inference_pipeline.py           | 14 +++++++++++++
 .../wan/flow_matching/test_flow_pipeline.py   | 15 ++++++++++++++
 .../flow_matching/test_time_shift_utils.py    | 16 ++++++++++++++-
 .../wan/inference/test_inference_init.py      | 16 ++++++++++++++-
 .../wan/inference/test_inference_utils.py     | 16 ++++++++++++++-
 .../megatron/model/wan/test_rope_utils.py     | 15 +++++++++++++-
 .../megatron/model/wan/test_utils.py          | 16 +++++++++++++--
 .../megatron/model/wan/test_wan_layer_spec.py | 16 +++++++++++++--
 .../megatron/model/wan/test_wan_model_misc.py | 16 +++++++++++++--
 .../megatron/model/wan/test_wan_provider.py   | 20 +++++++++++++++----
 .../megatron/model/wan/test_wan_step.py       | 18 ++++++++++++++---
 14 files changed, 193 insertions(+), 25 deletions(-)

diff --git a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
index 6fb27798..c4dc6014 100644
--- a/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
+++ b/tests/unit_tests/megatron/data/wan/test_wan_energon_datamodule.py
@@ -1,4 +1,16 @@
-import types
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from dfm.src.megatron.data.wan import wan_energon_datamodule as wan_dm_mod
 from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder
@@ -53,5 +65,3 @@ def test_wan_datamodule_config_initialization(monkeypatch):
     # build_datasets should return train loader thrice
     train, val, test = cfg.build_datasets(context=None)
     assert train == "train" and val == "train" and test == "train"
-
-
diff --git a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
index d8f10d74..e1980052 100644
--- a/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
+++ b/tests/unit_tests/megatron/data/wan/test_wan_mock_datamodule.py
@@ -1,5 +1,18 @@
-import torch
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
+import torch
 from torch.utils.data import DataLoader
 
 from dfm.src.megatron.data.wan.wan_mock_datamodule import WanMockDataModuleConfig
@@ -50,5 +63,3 @@ def test_wan_mock_datamodule_build_and_batch_shapes():
     assert batch["seq_len_kv"].dtype == torch.int32
     assert batch["seq_len_kv_padded"].dtype == torch.int32
     assert batch["grid_sizes"].dtype == torch.int32
-
-
diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
index 2de5173d..c4ed63b3 100644
--- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
+++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 import torch
 
 from dfm.src.megatron.data.wan.wan_taskencoder import WanTaskEncoder, cook, parallel_state
@@ -43,9 +42,13 @@ def test_encode_sample_no_context_parallel(monkeypatch):
     monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
     # Ensure seeded wrapper has an active worker config
     from megatron.energon.task_encoder.base import WorkerConfig
+
     class _FakeWorkerCfg:
-        def worker_seed(self): return 123
+        def worker_seed(self):
+            return 123
+
         active_worker_sample_index = 0
+
     monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False)
 
     # Construct a minimal, consistent sample
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
index 4110c8e3..52e19708 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 
 import pytest
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
index 1fcc3ba4..8511068e 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
@@ -1,4 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import types
+
 import torch
 
 from dfm.src.megatron.model.wan.flow_matching import flow_pipeline as flow_pipeline_mod
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
index 977fb55c..9739c626 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
@@ -1,9 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 from dfm.src.megatron.model.wan.flow_matching.time_shift_utils import (
-    time_shift,
     compute_density_for_timestep_sampling,
     get_flow_match_loss_weight,
+    time_shift,
 )
 
 
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
index 1a0eae39..514857fd 100644
--- a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
@@ -1,4 +1,18 @@
-from dfm.src.megatron.model.wan.inference import SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dfm.src.megatron.model.wan.inference import MAX_AREA_CONFIGS, SIZE_CONFIGS, SUPPORTED_SIZES
 
 
 def test_size_configs_structure_and_values():
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
index b0389bbd..022cddce 100644
--- a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
@@ -1,7 +1,21 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
 import os
 import tempfile
 
-import argparse
 import torch
 
 from dfm.src.megatron.model.wan.inference import utils as inf_utils
diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
index be7935b8..e9451c64 100644
--- a/tests/unit_tests/megatron/model/wan/test_rope_utils.py
+++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
@@ -1,4 +1,17 @@
-import pytest
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 from dfm.src.megatron.model.wan.rope_utils import Wan3DRopeEmbeddings
diff --git a/tests/unit_tests/megatron/model/wan/test_utils.py b/tests/unit_tests/megatron/model/wan/test_utils.py
index 6ee48c75..3f89b4cd 100644
--- a/tests/unit_tests/megatron/model/wan/test_utils.py
+++ b/tests/unit_tests/megatron/model/wan/test_utils.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 from dfm.src.megatron.model.wan.utils import grid_sizes_calculation, patchify, unpatchify
@@ -32,5 +46,3 @@ def test_patchify_unpatchify_roundtrip():
     assert isinstance(y, list) and len(y) == 1
     assert y[0].shape == x[0].shape
     torch.testing.assert_close(y[0], x[0], rtol=1e-5, atol=1e-5)
-
-
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
index de327a56..21ee570e 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_layer_spec.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dfm.src.megatron.model.wan.wan_layer_spec import get_wan_block_with_transformer_engine_spec
 
 
@@ -10,5 +24,3 @@ def test_get_wan_block_with_transformer_engine_spec_basic():
     # Expected submodule fields exist
     for name in ["norm1", "norm2", "norm3", "full_self_attention", "cross_attention", "mlp"]:
         assert hasattr(sub, name), f"Missing submodule {name}"
-
-
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
index 09b65729..de141def 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_model_misc.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 
 from dfm.src.megatron.model.wan.wan_model import sinusoidal_embedding_1d
@@ -9,5 +23,3 @@ def test_sinusoidal_embedding_1d_shape_and_dtype():
     emb = sinusoidal_embedding_1d(dim, pos)
     assert emb.shape == (pos.shape[0], dim)
     assert emb.dtype == torch.float32
-
-
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
index ea337aa7..bd451ff7 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
@@ -1,8 +1,22 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
+from megatron.core import parallel_state
 
-from dfm.src.megatron.model.wan.wan_provider import WanModelProvider
 from dfm.src.megatron.model.wan.wan_model import WanModel
-from megatron.core import parallel_state
+from dfm.src.megatron.model.wan.wan_provider import WanModelProvider
 
 
 def test_wan_model_provider_provide_returns_model(monkeypatch):
@@ -51,5 +65,3 @@ def test_wan_model_provider_provide_returns_model(monkeypatch):
     assert model.config.hidden_size == 64
     assert model.config.num_attention_heads == 4
     assert model.config.text_dim == 64
-
-
diff --git a/tests/unit_tests/megatron/model/wan/test_wan_step.py b/tests/unit_tests/megatron/model/wan/test_wan_step.py
index 5c04f3db..8ee0e9cb 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_step.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_step.py
@@ -1,7 +1,21 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import pytest
 import torch
 
-from dfm.src.megatron.model.wan.wan_step import wan_data_step, WanForwardStep
+from dfm.src.megatron.model.wan.wan_step import WanForwardStep, wan_data_step
 
 
 class _DummyIter:
@@ -46,5 +60,3 @@ def test_wan_forward_step_loss_partial_creation():
 
     assert isinstance(loss_fn, functools.partial)
     assert callable(loss_fn)
-
-

From 244ee74ee6d93f2710bbb8fb835a985f6ff0b570 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Fri, 14 Nov 2025 08:34:06 -0800
Subject: [PATCH 03/20] ruff lint

---
 .../megatron/data/wan/test_wan_taskencoder.py | 29 +++++++++++++++----
 .../test_flow_inference_pipeline.py           |  8 +++--
 .../wan/flow_matching/test_flow_pipeline.py   |  8 +++--
 .../wan/inference/test_inference_utils.py     | 13 ++++++---
 .../megatron/model/wan/test_rope_utils.py     |  2 --
 5 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
index c4ed63b3..d190ef30 100644
--- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
+++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
@@ -69,7 +69,10 @@ def worker_seed(self):
         "pickle": context_embeddings,
     }
 
-    enc = WanTaskEncoder(seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None)
+
+    enc = WanTaskEncoder(
+        seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None
+    )
     out = enc.encode_sample(sample)
 
     # Grid / patches
@@ -104,9 +107,13 @@ def test_batch_with_packing_buffer_size(monkeypatch):
     monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
     # Ensure seeded wrapper has an active worker config
     from megatron.energon.task_encoder.base import WorkerConfig
+
     class _FakeWorkerCfg:
-        def worker_seed(self): return 456
+        def worker_seed(self):
+            return 456
+
         active_worker_sample_index = 0
+
     monkeypatch.setattr(WorkerConfig, "active_worker_config", _FakeWorkerCfg(), raising=False)
 
     c = 4
@@ -122,17 +129,27 @@ def worker_seed(self): return 456
         "pickle": torch.randn(32, 128),
     }
 
-    enc = WanTaskEncoder(seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3)
+    enc = WanTaskEncoder(
+        seq_length=256, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=3
+    )
     diff_sample = enc.encode_sample(sample)
     batch = enc.batch([diff_sample])
 
     assert isinstance(batch, dict)
-    for k in ["video_latents", "context_embeddings", "loss_mask", "seq_len_q", "seq_len_q_padded", "seq_len_kv", "seq_len_kv_padded", "grid_sizes", "video_metadata"]:
+    for k in [
+        "video_latents",
+        "context_embeddings",
+        "loss_mask",
+        "seq_len_q",
+        "seq_len_q_padded",
+        "seq_len_kv",
+        "seq_len_kv_padded",
+        "grid_sizes",
+        "video_metadata",
+    ]:
         assert k in batch
 
     # video_latents: [S, 1, ...], where S equals sample.video length when CP world size is 1
     assert batch["video_latents"].shape[1] == 1
     assert batch["context_embeddings"].shape[1] == 1
     assert batch["loss_mask"].shape[1] == 1
-
-
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
index 52e19708..82471a8e 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
@@ -36,6 +36,7 @@ class _Cfg:
 
     # Instantiate object without running heavy init by patching __init__ to a no-op
     pip = object.__new__(FlowInferencePipeline)
+
     pip.inference_cfg = _Cfg()
 
     latest = FlowInferencePipeline._select_checkpoint_dir(pip, str(base), checkpoint_step=None)
@@ -55,15 +56,20 @@ class _Model:
         class _Cfg:
             hidden_size = 16
             qkv_format = "sbhd"
+
         config = _Cfg()
+
         def __call__(self, x, grid_sizes, t, **kwargs):
             return x  # echo input
+
         def set_input_tensor(self, x):
             pass
+
     pip.model = _Model()
 
     # Patch parallel state to no-PP path
     from megatron.core import parallel_state
+
     monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False)
 
     S, B, H = 8, 1, pip.model.config.hidden_size
@@ -81,5 +87,3 @@ def set_input_tensor(self, x):
         arg_c=arg_c,
     )
     assert out.shape == latent_model_input.shape
-
-
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
index 8511068e..70fa1fc1 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
@@ -37,13 +37,17 @@ def __call__(self, x, grid_sizes, t, context, packed_seq_params):
 def test_flow_pipeline_training_step_cpu_stub(monkeypatch):
     # Bypass heavy diffusers init
     def _stub_init(self, model_id="x", seed=0):
-        self.pipe = types.SimpleNamespace(scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000)))
+        self.pipe = types.SimpleNamespace(
+            scheduler=types.SimpleNamespace(config=types.SimpleNamespace(num_train_timesteps=1000))
+        )
+
     monkeypatch.setattr(FlowPipeline, "__init__", _stub_init)
 
     # Make patchify accept both tensor and list for this test
     def _safe_patchify(x, patch_size):
         # Always delegate to the real implementation in utils to avoid recursion
         from dfm.src.megatron.model.wan import utils as wan_utils
+
         impl = wan_utils.patchify
         # Normalize inputs to expected 4D [C, F, H, W] without batch dim
         if isinstance(x, list):
@@ -115,5 +119,3 @@ def _safe_patchify(x, patch_size):
     assert model_pred.shape == video_latents.shape
     assert weighted_loss.shape[:2] == video_latents.shape[:2]
     assert split_loss_mask.shape == loss_mask.shape
-
-
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
index 022cddce..8445fddd 100644
--- a/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_utils.py
@@ -55,25 +55,30 @@ def test_cache_video_uses_writer_and_returns_path(monkeypatch):
     class _DummyWriter:
         def __init__(self, path, fps=None, codec=None, quality=None):
             calls["path"] = path
+
         def append_data(self, frame):
             calls["frames"] += 1
+
         def close(self):
             pass
 
-    monkeypatch.setattr(inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality))
+    monkeypatch.setattr(
+        inf_utils.imageio, "get_writer", lambda path, fps, codec, quality: _DummyWriter(path, fps, codec, quality)
+    )
 
     # Stub make_grid to return a fixed CHW tensor regardless of input
     def _fake_make_grid(x, nrow, normalize, value_range):
         return torch.rand(3, 4, 5)
+
     monkeypatch.setattr(inf_utils.torchvision.utils, "make_grid", _fake_make_grid)
 
     # Build a tensor whose unbind(2) yields 2 slices so we expect 2 frames written
     vid = torch.rand(3, 3, 2, 2)  # shape chosen to exercise unbind(2)
     with tempfile.TemporaryDirectory() as td:
         out_file = os.path.join(td, "out.mp4")
-        result = inf_utils.cache_video(vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1)
+        result = inf_utils.cache_video(
+            vid, save_file=out_file, fps=5, suffix=".mp4", nrow=1, normalize=False, value_range=(0.0, 1.0), retry=1
+        )
         assert result == out_file
         assert calls["path"] == out_file
         assert calls["frames"] == vid.shape[2]  # frames equal to number of unbinds on dim=2
-
-
diff --git a/tests/unit_tests/megatron/model/wan/test_rope_utils.py b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
index e9451c64..7e31d8d0 100644
--- a/tests/unit_tests/megatron/model/wan/test_rope_utils.py
+++ b/tests/unit_tests/megatron/model/wan/test_rope_utils.py
@@ -47,5 +47,3 @@ def test_wan3d_rope_embeddings_shapes_and_padding():
     first_padded_len = padded_lens[0]
     tail = out[first_seq_len:first_padded_len]
     assert torch.all(tail == 0), "Padded region should be zeros"
-
-

From 550f283a5a3455e5b33e2d4a43aa124e2c6d0299 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Fri, 14 Nov 2025 08:51:38 -0800
Subject: [PATCH 04/20] ruff lint

---
 tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py     | 1 -
 .../model/wan/flow_matching/test_flow_inference_pipeline.py    | 1 +
 .../megatron/model/wan/flow_matching/test_flow_pipeline.py     | 3 +++
 .../megatron/model/wan/flow_matching/test_time_shift_utils.py  | 2 --
 .../megatron/model/wan/inference/test_inference_init.py        | 3 ---
 5 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
index d190ef30..e739a1ec 100644
--- a/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
+++ b/tests/unit_tests/megatron/data/wan/test_wan_taskencoder.py
@@ -69,7 +69,6 @@ def worker_seed(self):
         "pickle": context_embeddings,
     }
 
-
     enc = WanTaskEncoder(
         seq_length=1024, patch_temporal=patch_temporal, patch_spatial=patch_spatial, packing_buffer_size=None
     )
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
index 82471a8e..95e2cfca 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_inference_pipeline.py
@@ -52,6 +52,7 @@ class _Cfg:
 def test_forward_pp_step_no_pp(monkeypatch):
     # Build a minimal instance skipping heavy init
     pip = object.__new__(FlowInferencePipeline)
+
     class _Model:
         class _Cfg:
             hidden_size = 16
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
index 70fa1fc1..d93a4298 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_flow_pipeline.py
@@ -63,10 +63,12 @@ def _safe_patchify(x, patch_size):
                 t = t.squeeze(0)
             x_norm = [t]
         return impl(x_norm, patch_size)
+
     monkeypatch.setattr(flow_pipeline_mod, "patchify", _safe_patchify)
 
     # Disable context parallelism and force last pipeline stage
     from megatron.core import parallel_state
+
     monkeypatch.setattr(parallel_state, "get_context_parallel_world_size", lambda: 1, raising=False)
     monkeypatch.setattr(parallel_state, "is_pipeline_last_stage", lambda: True, raising=False)
 
@@ -85,6 +87,7 @@ def _safe_patchify(x, patch_size):
 
     # Packed seq params with simple cumulative lengths
     from megatron.core.packed_seq_params import PackedSeqParams
+
     cu = torch.tensor([0, seq_len], dtype=torch.int32)
     packed_seq_params = {
         "self_attention": PackedSeqParams(
diff --git a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
index 9739c626..b239584c 100644
--- a/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
+++ b/tests/unit_tests/megatron/model/wan/flow_matching/test_time_shift_utils.py
@@ -64,5 +64,3 @@ def test_get_flow_match_loss_weight_simple_cases():
     sigma = torch.ones(5, dtype=torch.float32)
     w = get_flow_match_loss_weight(sigma, shift=2.0)
     assert torch.allclose(w, torch.full_like(sigma, 3.0))
-
-
diff --git a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
index 514857fd..f8005047 100644
--- a/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
+++ b/tests/unit_tests/megatron/model/wan/inference/test_inference_init.py
@@ -38,6 +38,3 @@ def test_supported_sizes_lists():
         assert isinstance(sizes, tuple)
         for s in sizes:
             assert s in SIZE_CONFIGS
-
-
-

From 926a9513a36c2773077262eaa9fb465e9207ab9a Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 13:19:06 -0800
Subject: [PATCH 05/20] Explicit mcore path override to use Megatron-Bridge's
 pinned submodule commit

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 56e934d9..05a40a68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -132,6 +132,7 @@ explicit = true
 [tool.uv.sources]
 nemo-automodel = { path = "3rdparty/Automodel" }
 megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
+megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
 nvidia-resiliency-ext = { index = "pypi" }
 

From ecdef9e80256ff5d2e42508d09847c4978b2be22 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:04:03 -0800
Subject: [PATCH 06/20] Update Megatron-Bridge submodule to latest main with
 correct Megatron-LM commit (3cbe5c68)

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 3rdparty/Megatron-Bridge | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge
index 8e21f81a..4e4ce420 160000
--- a/3rdparty/Megatron-Bridge
+++ b/3rdparty/Megatron-Bridge
@@ -1 +1 @@
-Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9
+Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a

From 50385183f621feb8351d6bd7888249d235baa8ec Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:25:20 -0800
Subject: [PATCH 07/20] Add Mcore WAN pretrain mock test to CI/CD

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 .github/workflows/cicd-main.yml               |   3 +
 .../L2_Mcore_Mock_Tests_GPU.sh                |  15 +++
 .../test_mcore_wan_pretrain.py                | 109 ++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
 create mode 100644 tests/functional_tests/test_mcore_wan_pretrain.py

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index b3f08665..7f792d47 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -92,6 +92,9 @@ jobs:
           - script: L2_Functional_Tests_GPU
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
             timeout: 30
+          - script: L2_Mcore_Mock_Tests_GPU
+            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            timeout: 30
     needs: [cicd-unit-tests]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
new file mode 100644
index 00000000..871b9e6a
--- /dev/null
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -0,0 +1,15 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
+
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
new file mode 100644
index 00000000..b19836af
--- /dev/null
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functional smoke tests for Mcore WAN pretrain mock runs."""
+
+import os
+import subprocess
+import tempfile
+
+import pytest
+
+
+class TestMcoreWanPretrain:
+    """Test class for Mcore WAN pretrain functional tests."""
+
+    @pytest.mark.run_only_on("GPU")
+    def test_wan_pretrain_mock(self, tmp_path):
+        """
+        Functional test for WAN pretrain recipe with mock data.
+
+        This test verifies that the WAN pretrain recipe can run successfully
+        in mock mode with minimal configuration, ensuring:
+        1. The distributed training can start without errors
+        2. Model initialization works correctly
+        3. Forward/backward passes complete successfully
+        4. The training loop executes without crashes
+        """
+        # Set up temporary directories for dataset and checkpoints
+        dataset_path = os.path.join(tmp_path, "mock_dataset")
+        checkpoint_dir = os.path.join(tmp_path, "checkpoints")
+        os.makedirs(dataset_path, exist_ok=True)
+        os.makedirs(checkpoint_dir, exist_ok=True)
+
+        # Build the command for the mock run
+        cmd = [
+            "python",
+            "-m",
+            "torch.distributed.run",
+            "--nproc_per_node=1",
+            "examples/megatron/recipes/wan/pretrain_wan.py",
+            "--training-mode",
+            "pretrain",
+            "model.tensor_model_parallel_size=1",
+            "model.pipeline_model_parallel_size=1",
+            "model.context_parallel_size=1",
+            "model.crossattn_emb_size=1536",
+            "model.hidden_size=1536",
+            "model.ffn_hidden_size=8960",
+            "model.num_attention_heads=12",
+            "model.num_layers=3",
+            "model.qkv_format=thd",
+            f"dataset.path={dataset_path}",
+            f"checkpoint.save={checkpoint_dir}",
+            f"checkpoint.load={checkpoint_dir}",
+            "checkpoint.load_optim=false",
+            "checkpoint.save_interval=200",
+            "optimizer.lr=5e-6",
+            "optimizer.min_lr=5e-6",
+            "train.eval_iters=0",
+            "scheduler.lr_decay_style=constant",
+            "scheduler.lr_warmup_iters=0",
+            "model.seq_length=2048",
+            "dataset.seq_length=2048",
+            "train.global_batch_size=2",
+            "train.micro_batch_size=1",
+            "dataset.global_batch_size=2",
+            "dataset.micro_batch_size=1",
+            "logger.log_interval=1",
+            "--mock",
+        ]
+
+        # Run the command with a timeout
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=300,  # 5 minute timeout
+                check=True,
+            )
+
+            # Print output for debugging if needed
+            print("STDOUT:", result.stdout)
+            print("STDERR:", result.stderr)
+
+            # Basic verification that the run completed
+            assert result.returncode == 0, f"Command failed with return code {result.returncode}"
+
+            # Check for common success indicators in output
+            assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), (
+                "Expected to see iteration progress in output"
+            )
+
+        except subprocess.TimeoutExpired:
+            pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
+        except subprocess.CalledProcessError as e:
+            pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
+

From c746d180d102a415412e28bed4e27eff8ce40347 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 14:30:53 -0800
Subject: [PATCH 08/20] lintfix

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh | 1 -
 tests/functional_tests/test_mcore_wan_pretrain.py | 2 --
 2 files changed, 3 deletions(-)

diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
index 871b9e6a..2e99db05 100644
--- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -12,4 +12,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
-
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index b19836af..780bb253 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -16,7 +16,6 @@
 
 import os
 import subprocess
-import tempfile
 
 import pytest
 
@@ -106,4 +105,3 @@ def test_wan_pretrain_mock(self, tmp_path):
             pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
         except subprocess.CalledProcessError as e:
             pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
-

From 697201d23eb0417cf5cbb126a679719a35ed7f79 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Fri, 14 Nov 2025 17:16:42 -0800
Subject: [PATCH 09/20] Fix slow Docker build from Megatron-LM source

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 docker/Dockerfile.ci | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 7096b3c6..8de9c016 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -32,7 +32,19 @@ RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
 # Copy dependency files and source code (needed for dynamic version resolution)
 COPY pyproject.toml uv.lock ./
 COPY dfm ./dfm
-COPY 3rdparty ./3rdparty
+
+# Copy 3rdparty dependencies with minimal files for metadata resolution
+# Copy Automodel
+COPY 3rdparty/Automodel ./3rdparty/Automodel
+
+# Copy Megatron-Bridge
+COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
+COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src
+
+# Copy minimal Megatron-LM files for metadata (prevents full source build)
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
+COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
 
 # Install dependencies in two steps:
 # 1. Install build dependencies first (required for packages with no-build-isolation)

From 013ca6da5d4291401218b2dafe8ce4ebc7dd88bc Mon Sep 17 00:00:00 2001
From: Charlie Truong <chtruong@nvidia.com>
Date: Sat, 15 Nov 2025 21:33:27 -0600
Subject: [PATCH 10/20] ci: Update gpu runners to use self-hosted-nemo (#48)

* ci: Update gpu runners to use self-hosted-nemo

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Use uv run in test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Ensure uv group megatron-bridge is used for test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain

* Update TRANSFORMERS_OFFLINE environment variable to 0 and increase timeout in test_mcore_wan_pretrain

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Revert GHA changes

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Move uv run group call to L2_Mcore_Mock_Tests_GPU

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Set test back to 5 minute timeout

Signed-off-by: Charlie Truong <chtruong@nvidia.com>

* Megatron fixes (#49)

* Enhance DiT and Wan layer specifications

- Updated `get_query_key_value_tensors` method in `dit_attention.py` to include an `output_gate` parameter and set `split_qkv` to default to `True`.
- Modified `WanLayerWithAdaLN` class in `wan_layer_spec.py` to add `rotary_pos_cos_sin` parameter for improved positional encoding handling.

* Implement ProcessGroupCollection initialization in DiT and Wan models

- Added initialization of `pg_collection` in both `DiTCrossAttentionModel` and `WanModel` to ensure proper handling of process groups.
- This change checks if `pg_collection` exists and is not None before assigning it, enhancing the robustness of the models.

* Update CONTRIBUTING.md to include detailed setup instructions for development environment and Docker container usage. Added sections for building and running the container, as well as setting the PYTHONPATH for DFM.

* Refactor import statements in dit_model.py to streamline dependencies. Removed redundant import of ProcessGroupCollection, enhancing code clarity and maintainability.

* Refactor code style in DiT and Wan models

- Updated string quotes in `dit_model.py` and `wan_model.py` for consistency, changing from single to double quotes.
- Reformatted the `get_query_key_value_tensors` method call in `dit_attention.py` for improved readability by breaking it into multiple lines.

* Revert M4 changes

* Ruff

* Ruff

* Lint

---------

Co-authored-by: Abhinav Garg <abhinavg@stanford.edu>

* Revert "Revert GHA changes"

This reverts commit d7ad1ab48b4d5f2fb00f1a51c84320228c1f64f3.

* tempfortest: timeout setting

Signed-off-by: Pablo Garay <pagaray@nvidia.com>

* workflow dispatch

Signed-off-by: Pablo Garay <pagaray@nvidia.com>

* update

Signed-off-by: Pablo Garay <pagaray@nvidia.com>

* add logging

Signed-off-by: Pablo Garay <pagaray@nvidia.com>

* Update test configuration for Mcore WAN pretraining

- Increased the number of processes per node from 1 to 2 for distributed training.
- Set the number of training iterations to 10 to enhance the training process.

* More changes

* Lint

---------

Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Pablo Garay <pagaray@nvidia.com>
Co-authored-by: Abhinav Garg <abhinavg@stanford.edu>
Co-authored-by: Pablo Garay <pagaray@nvidia.com>
Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 .github/actions/test-template/action.yml      |  2 +-
 .github/workflows/cicd-main.yml               | 80 ++++++++++---------
 CONTRIBUTING.md                               | 28 +++++++
 .../megatron/model/common/dit_attention.py    |  8 +-
 dfm/src/megatron/model/dit/dit_model.py       |  1 -
 dfm/src/megatron/model/wan/wan_layer_spec.py  |  1 +
 .../L2_Mcore_Mock_Tests_GPU.sh                |  2 +-
 .../test_mcore_wan_pretrain.py                | 15 ++--
 8 files changed, 83 insertions(+), 54 deletions(-)

diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml
index ccf10a80..ef0e5446 100644
--- a/.github/actions/test-template/action.yml
+++ b/.github/actions/test-template/action.yml
@@ -161,7 +161,7 @@ runs:
           -d \
           --name nemo_container_${{ github.run_id }} ${ARG[@]} \
           --shm-size=64g \
-          --env TRANSFORMERS_OFFLINE=1 \
+          --env TRANSFORMERS_OFFLINE=0 \
           --env HYDRA_FULL_ERROR=1 \
           --env HF_HOME=/home/TestData/HF_HOME \
           --env RUN_ID=${{ github.run_id }} \
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 7f792d47..fc8b2c25 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -13,6 +13,7 @@
 # limitations under the License.
 name: CICD NeMo
 on:
+  workflow_dispatch:
   schedule:
     - cron: 0 0 * * *
   push:
@@ -45,57 +46,58 @@ jobs:
     with:
       image-name: dfm
       dockerfile: docker/Dockerfile.ci
+      runner: self-hosted-nemo
     secrets:
       AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-  cicd-unit-tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - script: L0_Unit_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-            timeout: 30
-          - script: L0_Unit_Tests_CPU
-            runner: linux-amd64-cpu16
-            cpu-only: true
-    needs: [cicd-container-build]
-    runs-on: ${{ matrix.runner }}
-    name: ${{ matrix.script }}
-    environment: nemo-ci
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          submodules: recursive
-      - name: main
-        uses: ./.github/actions/test-template
-        with:
-          runner: ${{ runner.name }}
-          script: ${{ matrix.script }}
-          timeout: ${{ matrix.timeout || 10 }}
-          is_unit_test: "true"
-          image: dfm
-          cpu-only: ${{ matrix.cpu-only || false }}
-          has-azure-credentials: "true"
-          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+  # cicd-unit-tests:
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       include:
+  #         - script: L0_Unit_Tests_GPU
+  #           runner: self-hosted-nemo
+  #           timeout: 30
+  #         - script: L0_Unit_Tests_CPU
+  #           runner: linux-amd64-cpu16
+  #           cpu-only: true
+  #   needs: [cicd-container-build]
+  #   runs-on: ${{ matrix.runner }}
+  #   name: ${{ matrix.script }}
+  #   environment: nemo-ci
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@v4
+  #       with:
+  #         submodules: recursive
+  #     - name: main
+  #       uses: ./.github/actions/test-template
+  #       with:
+  #         runner: ${{ runner.name }}
+  #         script: ${{ matrix.script }}
+  #         timeout: ${{ matrix.timeout || 10 }}
+  #         is_unit_test: "true"
+  #         image: dfm
+  #         cpu-only: ${{ matrix.cpu-only || false }}
+  #         has-azure-credentials: "true"
+  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   cicd-e2e-tests:
     strategy:
       fail-fast: false
       matrix:
         include:
-          - script: L2_Functional_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-            timeout: 30
+          # - script: L2_Functional_Tests_GPU
+          #   runner: self-hosted-nemo
+          #   timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            runner: self-hosted-nemo
             timeout: 30
-    needs: [cicd-unit-tests]
+    needs: [cicd-container-build]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -120,7 +122,7 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - cicd-container-build
-      - cicd-unit-tests
+      # - cicd-unit-tests
       - cicd-e2e-tests
     if: always()
     runs-on: ubuntu-latest
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 68ab66d4..aed9cf99 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,32 @@
 # Contributing To NeMo DFM
+## 🛠️ Setting Up Your Environment
+
+Use the instructions below to setup a dev environment and a dev container
+
+### Building a container
+```bash
+# We recommend you to get the latest commits for Megatron-Bridge and Autmodel
+# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands
+git submodule update --init --recursive --remote # Get all the 3rd party submodules
+cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
+# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
+git checkout <commit_hash>
+cd ../../../../
+docker build -f docker/Dockerfile.ci -t dfm:latest .
+```
+
+### Run the container
+```bash
+docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash
+```
+
+### inside the container
+```bash
+# Add DFM to PYTHONPATH
+export PYTHONPATH=$PYTHONPATH:/opt/DFM
+
+# Run a Mock Run:
+```
 
 ## Signing Your Work
 
diff --git a/dfm/src/megatron/model/common/dit_attention.py b/dfm/src/megatron/model/common/dit_attention.py
index 321e9b08..acf39d47 100644
--- a/dfm/src/megatron/model/common/dit_attention.py
+++ b/dfm/src/megatron/model/common/dit_attention.py
@@ -100,7 +100,7 @@ def __init__(
         else:
             self.k_layernorm = None
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states=None, output_gate=None, split_qkv=True):
         """
         Derives `query`, `key` and `value` tensors from `hidden_states`.
         """
@@ -251,13 +251,15 @@ def __init__(
             is_expert=False,
         )
 
-    def get_query_key_value_tensors(self, hidden_states, key_value_states, split_qkv=False):
+    def get_query_key_value_tensors(self, hidden_states, key_value_states, output_gate=None, split_qkv=True):
         """
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
         """
 
-        query, key, value = super().get_query_key_value_tensors(hidden_states, key_value_states)
+        query, key, value = super().get_query_key_value_tensors(
+            hidden_states, key_value_states, output_gate=output_gate, split_qkv=split_qkv
+        )
 
         # gather query and key heads across TP ranks if self.layernorm_across_heads is True
         if self.layernorm_across_heads and parallel_state.get_tensor_model_parallel_world_size() > 1:
diff --git a/dfm/src/megatron/model/dit/dit_model.py b/dfm/src/megatron/model/dit/dit_model.py
index e3ae8a29..38cb8422 100644
--- a/dfm/src/megatron/model/dit/dit_model.py
+++ b/dfm/src/megatron/model/dit/dit_model.py
@@ -105,7 +105,6 @@ def __init__(
         super(DiTCrossAttentionModel, self).__init__(config=config)
 
         self.config: TransformerConfig = config
-
         self.transformer_decoder_layer_spec = transformer_decoder_layer_spec()
         self.pre_process = pre_process
         self.post_process = post_process
diff --git a/dfm/src/megatron/model/wan/wan_layer_spec.py b/dfm/src/megatron/model/wan/wan_layer_spec.py
index 2b355930..a0d6354e 100644
--- a/dfm/src/megatron/model/wan/wan_layer_spec.py
+++ b/dfm/src/megatron/model/wan/wan_layer_spec.py
@@ -162,6 +162,7 @@ def forward(
         packed_seq_params=None,
         sequence_len_offset=None,
         inference_context=None,
+        rotary_pos_cos_sin=None,
     ):
         # the timestep embedding is stored in attention_mask argument
         timestep_emb = attention_mask
diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
index 2e99db05..b8d237a1 100644
--- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
+CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/test_mcore_wan_pretrain.py
index 780bb253..0c9879d9 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/test_mcore_wan_pretrain.py
@@ -46,7 +46,7 @@ def test_wan_pretrain_mock(self, tmp_path):
             "python",
             "-m",
             "torch.distributed.run",
-            "--nproc_per_node=1",
+            "--nproc_per_node=2",
             "examples/megatron/recipes/wan/pretrain_wan.py",
             "--training-mode",
             "pretrain",
@@ -67,6 +67,7 @@ def test_wan_pretrain_mock(self, tmp_path):
             "optimizer.lr=5e-6",
             "optimizer.min_lr=5e-6",
             "train.eval_iters=0",
+            "train.train_iters=10",
             "scheduler.lr_decay_style=constant",
             "scheduler.lr_warmup_iters=0",
             "model.seq_length=2048",
@@ -81,11 +82,12 @@ def test_wan_pretrain_mock(self, tmp_path):
 
         # Run the command with a timeout
         try:
+            # Stream output in real-time instead of capturing it
             result = subprocess.run(
                 cmd,
                 capture_output=True,
                 text=True,
-                timeout=300,  # 5 minute timeout
+                timeout=1800,  # 30 minute timeout
                 check=True,
             )
 
@@ -96,12 +98,7 @@ def test_wan_pretrain_mock(self, tmp_path):
             # Basic verification that the run completed
             assert result.returncode == 0, f"Command failed with return code {result.returncode}"
 
-            # Check for common success indicators in output
-            assert "iteration" in result.stdout.lower() or "iteration" in result.stderr.lower(), (
-                "Expected to see iteration progress in output"
-            )
-
         except subprocess.TimeoutExpired:
-            pytest.fail("WAN pretrain mock run exceeded timeout of 300 seconds")
+            pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
         except subprocess.CalledProcessError as e:
-            pytest.fail(f"WAN pretrain mock run failed with error:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
+            pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}")

From f240ccd896288bde0f543188b24e7247bae34e43 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 21:37:03 -0800
Subject: [PATCH 11/20] Reapply "Revert GHA changes"

This reverts commit fdb911f729d2870e96266e34b7592819140ff2e7.

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 .github/workflows/cicd-main.yml | 76 ++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index fc8b2c25..962e3d6c 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -52,52 +52,52 @@ jobs:
       AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
       AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
-  # cicd-unit-tests:
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - script: L0_Unit_Tests_GPU
-  #           runner: self-hosted-nemo
-  #           timeout: 30
-  #         - script: L0_Unit_Tests_CPU
-  #           runner: linux-amd64-cpu16
-  #           cpu-only: true
-  #   needs: [cicd-container-build]
-  #   runs-on: ${{ matrix.runner }}
-  #   name: ${{ matrix.script }}
-  #   environment: nemo-ci
-  #   steps:
-  #     - name: Checkout
-  #       uses: actions/checkout@v4
-  #       with:
-  #         submodules: recursive
-  #     - name: main
-  #       uses: ./.github/actions/test-template
-  #       with:
-  #         runner: ${{ runner.name }}
-  #         script: ${{ matrix.script }}
-  #         timeout: ${{ matrix.timeout || 10 }}
-  #         is_unit_test: "true"
-  #         image: dfm
-  #         cpu-only: ${{ matrix.cpu-only || false }}
-  #         has-azure-credentials: "true"
-  #         azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
-  #         azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-  #         azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+  cicd-unit-tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU
+            runner: self-hosted-nemo
+            timeout: 30
+          - script: L0_Unit_Tests_CPU
+            runner: linux-amd64-cpu16
+            cpu-only: true
+    needs: [cicd-container-build]
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}
+    environment: nemo-ci
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: main
+        uses: ./.github/actions/test-template
+        with:
+          runner: ${{ runner.name }}
+          script: ${{ matrix.script }}
+          timeout: ${{ matrix.timeout || 10 }}
+          is_unit_test: "true"
+          image: dfm
+          cpu-only: ${{ matrix.cpu-only || false }}
+          has-azure-credentials: "true"
+          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   cicd-e2e-tests:
     strategy:
       fail-fast: false
       matrix:
         include:
-          # - script: L2_Functional_Tests_GPU
-          #   runner: self-hosted-nemo
-          #   timeout: 30
+          - script: L2_Functional_Tests_GPU
+            runner: self-hosted-nemo
+            timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
             runner: self-hosted-nemo
             timeout: 30
-    needs: [cicd-container-build]
+    needs: [cicd-unit-tests]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -122,7 +122,7 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - cicd-container-build
-      # - cicd-unit-tests
+      - cicd-unit-tests
       - cicd-e2e-tests
     if: always()
     runs-on: ubuntu-latest

From 0964c625eff90ec2b637e59f090835731e924c76 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 21:43:54 -0800
Subject: [PATCH 12/20] update path per request

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh               | 2 +-
 .../recipes/test_wan_pretrain.py}                               | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename tests/functional_tests/{test_mcore_wan_pretrain.py => mcore/recipes/test_wan_pretrain.py} (99%)

diff --git a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
index b8d237a1..7977af26 100644
--- a/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
+++ b/tests/functional_tests/L2_Mcore_Mock_Tests_GPU.sh
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/test_mcore_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
+CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/mcore/recipes/test_wan_pretrain.py -m "not pleasefixme" --with_downloads -v
diff --git a/tests/functional_tests/test_mcore_wan_pretrain.py b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py
similarity index 99%
rename from tests/functional_tests/test_mcore_wan_pretrain.py
rename to tests/functional_tests/mcore/recipes/test_wan_pretrain.py
index 0c9879d9..ca68bd1c 100644
--- a/tests/functional_tests/test_mcore_wan_pretrain.py
+++ b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py
@@ -102,3 +102,4 @@ def test_wan_pretrain_mock(self, tmp_path):
             pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
         except subprocess.CalledProcessError as e:
             pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}")
+

From d08b5af04c6a2e96f0281c3cc2dca3b2997cc6e8 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 21:47:31 -0800
Subject: [PATCH 13/20] lintfix

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 tests/functional_tests/mcore/recipes/test_wan_pretrain.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/functional_tests/mcore/recipes/test_wan_pretrain.py b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py
index ca68bd1c..0c9879d9 100644
--- a/tests/functional_tests/mcore/recipes/test_wan_pretrain.py
+++ b/tests/functional_tests/mcore/recipes/test_wan_pretrain.py
@@ -102,4 +102,3 @@ def test_wan_pretrain_mock(self, tmp_path):
             pytest.fail("WAN pretrain mock run exceeded timeout of 1800 seconds (30 minutes)")
         except subprocess.CalledProcessError as e:
             pytest.fail(f"WAN pretrain mock run failed with return code {e.returncode}")
-

From eebe7318021e66119e3047919fc344fc10603415 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sat, 15 Nov 2025 22:32:59 -0800
Subject: [PATCH 14/20] update CONTRIBUTING.md

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 CONTRIBUTING.md | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index aed9cf99..1b34c164 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,19 +5,16 @@ Use the instructions below to setup a dev environment and a dev container
 
 ### Building a container
 ```bash
-# We recommend you to get the latest commits for Megatron-Bridge and Autmodel
-# The easiest way to do that might be to remove the 3rdparty directly completely before running the following commands
-git submodule update --init --recursive --remote # Get all the 3rd party submodules
-cd 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM # Megatron LM commit might be wrong
-# Get the right megatron commit from here: https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/3rdparty
-git checkout <commit_hash>
-cd ../../../../
+# Initialize all submodules (Megatron-Bridge, Automodel, and nested Megatron-LM)
+git submodule update --init --recursive
+
+# Build the container
 docker build -f docker/Dockerfile.ci -t dfm:latest .
 ```
 
 ### Run the container
 ```bash
-docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all $(pwd):/opt/DFM -it dfm:latest bash
+docker run --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --gpus all -v $(pwd):/opt/DFM -it dfm:latest bash
 ```
 
 ### inside the container
@@ -30,6 +27,8 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM
 
 ## Signing Your Work
 
+### Sign-Off (Required)
+
 * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
 
   * Any contribution which contains commits that are not Signed-Off will not be accepted.
@@ -43,6 +42,32 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM
   Signed-off-by: Your Name <your@email.com>
   ```
 
+### Commit Verification (Recommended)
+
+* We recommend signing your commits with SSH or GPG to get the "Verified" badge on GitHub.
+
+* **SSH Signing (Easiest):**
+  ```bash
+  # Configure SSH signing
+  git config --global gpg.format ssh
+  git config --global user.signingkey ~/.ssh/id_rsa.pub  # or id_ed25519.pub
+  git config --global commit.gpgsign true
+  
+  # Add your SSH key as a "Signing Key" on GitHub: https://github.com/settings/keys
+  ```
+
+* **GPG Signing (Alternative):**
+  ```bash
+  # Generate a GPG key
+  gpg --full-generate-key
+  
+  # Configure GPG signing
+  git config --global user.signingkey YOUR_GPG_KEY_ID
+  git config --global commit.gpgsign true
+  
+  # Add your GPG key to GitHub: https://github.com/settings/keys
+  ```
+
 * Full text of the DCO:
 
   ```

From 6685a540d9dd8b087d268c857be32208677ccb97 Mon Sep 17 00:00:00 2001
From: Pablo Garay <pagaray@nvidia.com>
Date: Sun, 16 Nov 2025 00:01:56 -0800
Subject: [PATCH 15/20] lintfix

Signed-off-by: Pablo Garay <pagaray@nvidia.com>
---
 CONTRIBUTING.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1b34c164..bb807bd7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -52,7 +52,7 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM
   git config --global gpg.format ssh
   git config --global user.signingkey ~/.ssh/id_rsa.pub  # or id_ed25519.pub
   git config --global commit.gpgsign true
-  
+
   # Add your SSH key as a "Signing Key" on GitHub: https://github.com/settings/keys
   ```
 
@@ -60,11 +60,11 @@ export PYTHONPATH=$PYTHONPATH:/opt/DFM
   ```bash
   # Generate a GPG key
   gpg --full-generate-key
-  
+
   # Configure GPG signing
   git config --global user.signingkey YOUR_GPG_KEY_ID
   git config --global commit.gpgsign true
-  
+
   # Add your GPG key to GitHub: https://github.com/settings/keys
   ```
 

From a5a109a7a4712eb7b526a0ba28ed84ab661ab1e9 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Sun, 16 Nov 2025 11:31:08 -0800
Subject: [PATCH 16/20] adding v run --group megatron-bridge

---
 tests/unit_tests/L0_Unit_Tests_CPU.sh | 2 +-
 tests/unit_tests/L0_Unit_Tests_GPU.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests/L0_Unit_Tests_CPU.sh b/tests/unit_tests/L0_Unit_Tests_CPU.sh
index 081c1564..dd8c1ee4 100644
--- a/tests/unit_tests/L0_Unit_Tests_CPU.sh
+++ b/tests/unit_tests/L0_Unit_Tests_CPU.sh
@@ -14,4 +14,4 @@
 
 # Hide GPU from PyTorch by setting CUDA_VISIBLE_DEVICES to empty
 # This makes torch.cuda.is_available() return False
-CUDA_VISIBLE_DEVICES="" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads
+CUDA_VISIBLE_DEVICES="" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads
diff --git a/tests/unit_tests/L0_Unit_Tests_GPU.sh b/tests/unit_tests/L0_Unit_Tests_GPU.sh
index ae77eb3f..0468cab6 100644
--- a/tests/unit_tests/L0_Unit_Tests_GPU.sh
+++ b/tests/unit_tests/L0_Unit_Tests_GPU.sh
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads
+CUDA_VISIBLE_DEVICES="0,1" uv run --group megatron-bridge coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads

From f43aea30b1e2c84f56b8db162455a78bd2212108 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Sun, 16 Nov 2025 11:46:42 -0800
Subject: [PATCH 17/20] update test

---
 .../megatron/model/wan/test_wan_provider.py     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
index bd451ff7..30d65e05 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 
 import torch
+import torch.nn as nn
 from megatron.core import parallel_state
 
 from dfm.src.megatron.model.wan.wan_model import WanModel
 from dfm.src.megatron.model.wan.wan_provider import WanModelProvider
+import dfm.src.megatron.model.wan.wan_model as wan_model_module
 
 
 def test_wan_model_provider_provide_returns_model(monkeypatch):
@@ -26,6 +28,21 @@ def test_wan_model_provider_provide_returns_model(monkeypatch):
     # Avoid querying uninitialized PP groups
     monkeypatch.setattr(parallel_state, "get_pipeline_model_parallel_world_size", lambda: 1, raising=False)
 
+    # Bypass Megatron's ProcessGroupCollection usage inside TransformerBlock during construction.
+    # CI does not initialize distributed groups; a dummy block suffices for construction checks.
+    class DummyTransformerBlock(nn.Module):
+        def __init__(self, *args, **kwargs):
+            super().__init__()
+            self.input_tensor = None
+
+        def set_input_tensor(self, input_tensor):
+            self.input_tensor = input_tensor
+
+        def forward(self, hidden_states, **kwargs):
+            return hidden_states
+
+    monkeypatch.setattr(wan_model_module, "TransformerBlock", DummyTransformerBlock, raising=False)
+
     provider = WanModelProvider(
         num_layers=2,  # keep small
         hidden_size=64,

From d2d983f91b26ae8b8457f91f19388d38045ef071 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Sun, 16 Nov 2025 11:51:52 -0800
Subject: [PATCH 18/20] ruff lint

---
 tests/unit_tests/megatron/model/wan/test_wan_provider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit_tests/megatron/model/wan/test_wan_provider.py b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
index 30d65e05..78541900 100644
--- a/tests/unit_tests/megatron/model/wan/test_wan_provider.py
+++ b/tests/unit_tests/megatron/model/wan/test_wan_provider.py
@@ -16,9 +16,9 @@
 import torch.nn as nn
 from megatron.core import parallel_state
 
+import dfm.src.megatron.model.wan.wan_model as wan_model_module
 from dfm.src.megatron.model.wan.wan_model import WanModel
 from dfm.src.megatron.model.wan.wan_provider import WanModelProvider
-import dfm.src.megatron.model.wan.wan_model as wan_model_module
 
 
 def test_wan_model_provider_provide_returns_model(monkeypatch):

From 166e8097604c6fc7669091ace7c640d2319962bd Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Mon, 17 Nov 2025 06:40:15 -0800
Subject: [PATCH 19/20] restore Dockerfile.ci

---
 docker/Dockerfile.ci | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index fadafa44..cfcc3a74 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -39,7 +39,8 @@ COPY 3rdparty/Automodel ./3rdparty/Automodel
 
 # Copy minimal Megatron-Bridge files for metadata (prevents full source build)
 COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
-COPY 3rdparty/Megatron-Bridge/src ./3rdparty/Megatron-Bridge/src
+COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/__init__.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
+COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/package_info.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
 
 # Copy minimal Megatron-LM files for metadata (prevents full source build)
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/

From c1bde61dff413cc6f691adf0601c54db86d5b154 Mon Sep 17 00:00:00 2001
From: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
Date: Mon, 17 Nov 2025 08:22:04 -0800
Subject: [PATCH 20/20] update  .github/workflows/cicd-main.yml

---
 .github/workflows/cicd-main.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 962e3d6c..2d1a28b5 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -91,9 +91,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - script: L2_Functional_Tests_GPU
-            runner: self-hosted-nemo
-            timeout: 30
           - script: L2_Mcore_Mock_Tests_GPU
             runner: self-hosted-nemo
             timeout: 30