From 2d486193985ff6da14638a6debd6d77fbf43f226 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Tue, 11 Nov 2025 19:35:23 -0800
Subject: [PATCH 01/34] init

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/examples/Automodel/pretrain/pretrain.py   | 29 +++++++++
 .../Automodel/pretrain/wan2_1_t2v_flow.yaml   | 64 +++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 dfm/examples/Automodel/pretrain/pretrain.py
 create mode 100644 dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml

diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py
new file mode 100644
index 00000000..0d8b2a79
--- /dev/null
+++ b/dfm/examples/Automodel/pretrain/pretrain.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from Automodel.recipes.finetune import TrainWan21DiffusionRecipe
+from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
+
+
+def main(default_config_path="/opt/DFM/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml"):
+    cfg = parse_args_and_load_config(default_config_path)
+    recipe = TrainWan21DiffusionRecipe(cfg)
+    recipe.setup()
+    recipe.run_train_validation_loop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
new file mode 100644
index 00000000..43f4075b
--- /dev/null
+++ b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -0,0 +1,64 @@
+seed: 42
+
+wandb:
+  project: wan-t2v-flow-matching-pretrain
+  mode: online
+  name: wan2_1_t2v_fm_pretrain
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+model:
+  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+
+data:
+  dataloader:
+    _target_: Automodel.datasets.build_wan21_dataloader
+    meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
+    batch_size: 1
+    num_workers: 2
+    device: cpu
+
+batch:
+  batch_size_per_node: 8
+
+training:
+  num_epochs: 100
+
+optim:
+  learning_rate: 5e-5
+  optimizer:
+    weight_decay: 0.1
+    betas: [0.9, 0.95]
+  # "warmup_steps": 1000,
+  # "lr_min": 1e-5,
+
+
+flow_matching:
+  use_sigma_noise: true
+  timestep_sampling: uniform
+  logit_mean: 0.0
+  logit_std: 1.5
+  flow_shift: 2.5
+  mix_uniform_ratio: 0.2
+  # "sigma_min": 0.0,  # PRETRAIN: No clamping, full range
+  # "sigma_max": 1.0,  # PRETRAIN: No clamping, full range
+
+fsdp:
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  dp_replicate_size: 1
+  dp_size: 8
+
+logging:
+  save_every: 1000
+  log_every: 2
+
+checkpoint:
+  enabled: true
+  checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/
+  model_save_format: torch_save
+  save_consolidated: false
+  restore_from: null
\ No newline at end of file

From eb61496fdc171ea38c959b88010f2f6771a5c6f0 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 10:19:13 -0800
Subject: [PATCH 02/34] add sigma_min/max

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../flow_matching/training_step_t2v.py         | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py
index 0e7b9bc0..1d75c9e8 100644
--- a/dfm/src/automodel/flow_matching/training_step_t2v.py
+++ b/dfm/src/automodel/flow_matching/training_step_t2v.py
@@ -40,6 +40,8 @@ def step_fsdp_transformer_t2v(
     logit_std: float = 1.0,
     flow_shift: float = 3.0,
     mix_uniform_ratio: float = 0.1,
+    sigma_min: float = 0.0,  # Default: no clamping (pretrain)
+    sigma_max: float = 1.0,  # Default: no clamping (pretrain)
     global_step: int = 0,
 ) -> Tuple[torch.Tensor, Dict]:
     """
@@ -96,14 +98,26 @@ def step_fsdp_transformer_t2v(
         # Apply flow shift: σ = shift/(shift + (1/u - 1))
         u_clamped = torch.clamp(u, min=1e-5)  # Avoid division by zero
         sigma = flow_shift / (flow_shift + (1.0 / u_clamped - 1.0))
-        sigma = torch.clamp(sigma, 0.0, 1.0)
+
+        # Clamp sigma (only if not full range [0,1])
+        # Pretrain uses [0, 1], finetune uses [0.02, 0.55]
+        if sigma_min > 0.0 or sigma_max < 1.0:
+            sigma = torch.clamp(sigma, sigma_min, sigma_max)
+        else:
+            sigma = torch.clamp(sigma, 0.0, 1.0)
 
     else:
         # Simple uniform without shift
         u = torch.rand(size=(batch_size,), device=device)
-        sigma = u
+
+        # Clamp sigma (only if not full range [0,1])
+        if sigma_min > 0.0 or sigma_max < 1.0:
+            sigma = torch.clamp(u, sigma_min, sigma_max)
+        else:
+            sigma = u
         sampling_method = "uniform_no_shift"
 
+
     # ========================================================================
     # Manual Flow Matching Noise Addition
     # ========================================================================

From 44a1cb6f1c1a1f785a4c7566aa48bac2854414a9 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 10:20:50 -0800
Subject: [PATCH 03/34] add sigma_min/max

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/finetune.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py
index 83c60d75..9684e7ae 100644
--- a/dfm/src/automodel/recipes/finetune.py
+++ b/dfm/src/automodel/recipes/finetune.py
@@ -198,6 +198,8 @@ def setup(self):
         self.logit_std = fm_cfg.get("logit_std", 1.0)
         self.flow_shift = fm_cfg.get("flow_shift", 3.0)
         self.mix_uniform_ratio = fm_cfg.get("mix_uniform_ratio", 0.1)
+        self.sigma_min = fm_cfg.get("sigma_min", 0.0)
+        self.sigma_max = fm_cfg.get("sigma_max", 1.0)
 
         logging.info(f"[INFO] Flow matching: {'ENABLED' if self.use_sigma_noise else 'DISABLED'}")
         if self.use_sigma_noise:
@@ -367,6 +369,8 @@ def run_train_validation_loop(self):
                             logit_std=self.logit_std,
                             flow_shift=self.flow_shift,
                             mix_uniform_ratio=self.mix_uniform_ratio,
+                            sigma_min=self.sigma_min,
+                            sigma_max=self.sigma_max,
                             global_step=global_step,
                         )
                     except Exception as exc:

From 275ac71a64beef29a3d536f99dfbc0cf44180f9a Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 10:40:54 -0800
Subject: [PATCH 04/34] rename finetune.py to train.py

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/examples/Automodel/pretrain/pretrain.py | 2 +-
 dfm/examples/automodel/finetune/finetune.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py
index 0d8b2a79..f1038198 100644
--- a/dfm/examples/Automodel/pretrain/pretrain.py
+++ b/dfm/examples/Automodel/pretrain/pretrain.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from Automodel.recipes.finetune import TrainWan21DiffusionRecipe
+from Automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 
diff --git a/dfm/examples/automodel/finetune/finetune.py b/dfm/examples/automodel/finetune/finetune.py
index ae07451f..7d77162c 100644
--- a/dfm/examples/automodel/finetune/finetune.py
+++ b/dfm/examples/automodel/finetune/finetune.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from Automodel.recipes.finetune import TrainWan21DiffusionRecipe
+from Automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 

From 8490de5306e57a1465893198e1cea61626bd61a9 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 11:08:51 -0800
Subject: [PATCH 05/34] add from_config

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../_diffusers/auto_diffusion_pipeline.py     | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
index d9e8c3ce..22a1edd3 100644
--- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
+++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
@@ -154,3 +154,60 @@ def from_pretrained(
                 parallel_module = manager.parallelize(comp_module)
                 setattr(pipe, comp_name, parallel_module)
         return pipe, created_managers
+
+    @classmethod
+    def from_config(
+        cls,
+        pretrained_model_name_or_path: str,
+        *model_args,
+        parallel_scheme: Optional[Dict[str, Dict[str, Any]]] = None,
+        device: Optional[torch.device] = None,
+        torch_dtype: Any = "auto",
+        move_to_device: bool = True,
+        load_for_training: bool = False,
+        components_to_load: Optional[Iterable[str]] = None,
+        **kwargs,
+    ) -> tuple[DiffusionPipeline, Dict[str, FSDP2Manager]]:
+        config = WanTransformer3DModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="transformer",
+            torch_dtype=torch_dtype,
+            **kwargs,
+        )
+        pipe: DiffusionPipeline = DiffusionPipeline.from_config(
+            config,
+            *model_args,
+            torch_dtype=torch_dtype,
+            **kwargs,
+        )
+        # Decide device
+        dev = _choose_device(device)
+
+        # Move modules to device/dtype first (helps avoid initial OOM during sharding)
+        if move_to_device:
+            for name, module in _iter_pipeline_modules(pipe):
+                if not components_to_load or name in components_to_load:
+                    logger.info("[INFO] Moving module: %s to device/dtype", name)
+                    _move_module_to_device(module, dev, torch_dtype)
+
+        # If loading for training, ensure the target module parameters are trainable
+        if load_for_training:
+            for name, module in _iter_pipeline_modules(pipe):
+                if not components_to_load or name in components_to_load:
+                    logger.info("[INFO] Ensuring params trainable: %s", name)
+                    _ensure_params_trainable(module, module_name=name)
+
+        # Use per-component FSDP2Manager init-args to parallelize components
+        created_managers: Dict[str, FSDP2Manager] = {}
+        if parallel_scheme is not None:
+            assert torch.distributed.is_initialized(), "Expect distributed environment to be initialized"
+            _init_parallelizer()
+            for comp_name, comp_module in _iter_pipeline_modules(pipe):
+                manager_args = parallel_scheme.get(comp_name)
+                if manager_args is None:
+                    continue
+                manager = FSDP2Manager(**manager_args)
+                created_managers[comp_name] = manager
+                parallel_module = manager.parallelize(comp_module)
+                setattr(pipe, comp_name, parallel_module)
+        return pipe, created_managers

From 1d1ca1dddfd6e4149bb03892ea25190c5bf83d4e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 19:56:51 -0800
Subject: [PATCH 06/34] pass scheduler and model

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../automodel/flow_matching/training_step_t2v.py    | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py
index 1d75c9e8..dc80ce74 100644
--- a/dfm/src/automodel/flow_matching/training_step_t2v.py
+++ b/dfm/src/automodel/flow_matching/training_step_t2v.py
@@ -28,8 +28,8 @@
 
 
 def step_fsdp_transformer_t2v(
-    pipe,
-    model_map: Dict,
+    scheduler,
+    model,
     batch,
     device,
     bf16,
@@ -76,7 +76,7 @@ def step_fsdp_transformer_t2v(
     # Flow Matching Timestep Sampling
     # ========================================================================
 
-    num_train_timesteps = pipe.scheduler.config.num_train_timesteps
+    num_train_timesteps = scheduler.config.num_train_timesteps
 
     if use_sigma_noise:
         use_uniform = torch.rand(1).item() < mix_uniform_ratio
@@ -117,7 +117,6 @@ def step_fsdp_transformer_t2v(
             sigma = u
         sampling_method = "uniform_no_shift"
 
-
     # ========================================================================
     # Manual Flow Matching Noise Addition
     # ========================================================================
@@ -200,10 +199,8 @@ def step_fsdp_transformer_t2v(
     # Forward Pass
     # ========================================================================
 
-    fsdp_model = model_map["transformer"]["fsdp_transformer"]
-
     try:
-        model_pred = fsdp_model(
+        model_pred = model(
             hidden_states=noisy_latents,
             timestep=timesteps_for_model,
             encoder_hidden_states=text_embeddings,
@@ -257,7 +254,7 @@ def step_fsdp_transformer_t2v(
         logger.info(f"[STEP {global_step}] LOSS DEBUG")
         logger.info("=" * 80)
         logger.info("[TARGET] Flow matching: v = ε - x_0")
-        logger.info(f"[PREDICTION] Scheduler type (inference only): {type(pipe.scheduler).__name__}")
+        logger.info(f"[PREDICTION] Scheduler type (inference only): {type(scheduler).__name__}")
         logger.info("")
         logger.info(f"[RANGES] Model pred: [{model_pred.min():.4f}, {model_pred.max():.4f}]")
         logger.info(f"[RANGES] Target (v): [{target.min():.4f}, {target.max():.4f}]")

From 16ab73eede8e018cddc1add66ed0df41db4923ff Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 19:57:32 -0800
Subject: [PATCH 07/34] update param

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/finetune.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py
index 9684e7ae..88603e58 100644
--- a/dfm/src/automodel/recipes/finetune.py
+++ b/dfm/src/automodel/recipes/finetune.py
@@ -22,7 +22,7 @@
 import torch
 import torch.distributed as dist
 import wandb
-from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
+from Automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
 from Automodel.flow_matching.training_step_t2v import (
     step_fsdp_transformer_t2v,
 )
@@ -51,10 +51,10 @@ def build_model_and_optimizer(
     dp_replicate_size: Optional[int] = None,
     use_hf_tp_plan: bool = False,
     optimizer_cfg: Optional[Dict[str, Any]] = None,
-) -> tuple[NeMoAutoDiffusionPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]:
+) -> tuple[NeMoWanPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]:
     """Build the WAN 2.1 diffusion model, parallel scheme, and optimizer."""
 
-    logging.info("[INFO] Building NeMoAutoDiffusionPipeline with transformer parallel scheme...")
+    logging.info("[INFO] Building NeMoWanPipeline with transformer parallel scheme...")
 
     if not dist.is_initialized():
         logging.info("[WARN] torch.distributed not initialized; proceeding in single-process mode")
@@ -84,7 +84,7 @@ def build_model_and_optimizer(
 
     parallel_scheme = {"transformer": manager_args}
 
-    pipe, created_managers = NeMoAutoDiffusionPipeline.from_pretrained(
+    pipe, created_managers = NeMoWanPipeline.from_pretrained(
         model_id,
         torch_dtype=bf16_dtype,
         device=device,
@@ -93,11 +93,7 @@ def build_model_and_optimizer(
         components_to_load=["transformer"],
     )
     fsdp2_manager = created_managers["transformer"]
-    transformer_module = getattr(pipe, "transformer", None)
-    if transformer_module is None:
-        raise RuntimeError("transformer not found in pipeline after parallelization")
-
-    model_map: dict[str, Dict[str, Any]] = {"transformer": {"fsdp_transformer": transformer_module}}
+    transformer_module = pipe.transformer
 
     trainable_params = [p for p in transformer_module.parameters() if p.requires_grad]
     if not trainable_params:
@@ -121,7 +117,7 @@ def build_model_and_optimizer(
 
     logging.info("[INFO] NeMoAutoDiffusion setup complete (pipeline + optimizer)")
 
-    return pipe, model_map, optimizer, fsdp2_manager.device_mesh
+    return pipe, optimizer, fsdp2_manager.device_mesh
 
 
 def build_lr_scheduler(
@@ -214,7 +210,7 @@ def setup(self):
         dp_replicate_size = fsdp_cfg.get("dp_replicate_size", None)
         use_hf_tp_plan = fsdp_cfg.get("use_hf_tp_plan", False)
 
-        (self.pipe, self.model_map, self.optimizer, self.device_mesh) = build_model_and_optimizer(
+        (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer(
             model_id=self.model_id,
             learning_rate=self.learning_rate,
             device=self.device,
@@ -229,7 +225,7 @@ def setup(self):
             optimizer_cfg=self.cfg.get("optim.optimizer", {}),
         )
 
-        self.model = self.model_map["transformer"]["fsdp_transformer"]
+        self.model = self.pipe.transformer
         self.peft_config = None
 
         batch_cfg = self.cfg.get("batch", {})
@@ -358,8 +354,8 @@ def run_train_validation_loop(self):
                 for micro_batch in batch_group:
                     try:
                         loss, _ = step_fsdp_transformer_t2v(
-                            pipe=self.pipe,
-                            model_map=self.model_map,
+                            scheduler=self.pipe.scheduler,
+                            model=self.model,
                             batch=micro_batch,
                             device=self.device,
                             bf16=self.bf16,

From 655c6e177a1aad33df24159daa1fe12df7c23d68 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 19:58:46 -0800
Subject: [PATCH 08/34] introduce NeMoWanPipeline

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../_diffusers/auto_diffusion_pipeline.py     | 60 +++++++++++--------
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
index 22a1edd3..b7eb979a 100644
--- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
+++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import logging
 import os
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -19,7 +20,7 @@
 import torch
 import torch.nn as nn
 from Automodel.distributed.dfm_parallelizer import WanParallelizationStrategy
-from diffusers import DiffusionPipeline
+from diffusers import DiffusionPipeline, WanPipeline
 from nemo_automodel.components.distributed import parallelizer
 from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager
 from nemo_automodel.shared.utils import dtype_from_str
@@ -155,30 +156,48 @@ def from_pretrained(
                 setattr(pipe, comp_name, parallel_module)
         return pipe, created_managers
 
+
+class NeMoWanPipeline(WanPipeline):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        return NeMoAutoDiffusionPipeline.from_pretrained(*args, **kwargs)
+
     @classmethod
     def from_config(
         cls,
-        pretrained_model_name_or_path: str,
-        *model_args,
+        model_id,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        config: dict = None,
         parallel_scheme: Optional[Dict[str, Dict[str, Any]]] = None,
         device: Optional[torch.device] = None,
-        torch_dtype: Any = "auto",
         move_to_device: bool = True,
-        load_for_training: bool = False,
         components_to_load: Optional[Iterable[str]] = None,
-        **kwargs,
-    ) -> tuple[DiffusionPipeline, Dict[str, FSDP2Manager]]:
-        config = WanTransformer3DModel.from_pretrained(
-            pretrained_model_name_or_path,
-            subfolder="transformer",
+    ):
+        # Load just the config
+        from diffusers import WanTransformer3DModel
+
+        if model_id is not None:
+            transformer = WanTransformer3DModel.from_pretrained(
+                model_id,
+                subfolder="transformer",
+                torch_dtype=torch.bfloat16,
+            )
+
+            # Get config and reinitialize with random weights
+            config = copy.deepcopy(transformer.config)
+            del transformer
+
+        # Initialize with random weights
+        transformer = WanTransformer3DModel.from_config(config)
+
+        # Load pipeline with random transformer
+        pipe = WanPipeline.from_pretrained(
+            model_id,
+            transformer=transformer,
             torch_dtype=torch_dtype,
-            **kwargs,
-        )
-        pipe: DiffusionPipeline = DiffusionPipeline.from_config(
-            config,
-            *model_args,
-            torch_dtype=torch_dtype,
-            **kwargs,
         )
         # Decide device
         dev = _choose_device(device)
@@ -190,13 +209,6 @@ def from_config(
                     logger.info("[INFO] Moving module: %s to device/dtype", name)
                     _move_module_to_device(module, dev, torch_dtype)
 
-        # If loading for training, ensure the target module parameters are trainable
-        if load_for_training:
-            for name, module in _iter_pipeline_modules(pipe):
-                if not components_to_load or name in components_to_load:
-                    logger.info("[INFO] Ensuring params trainable: %s", name)
-                    _ensure_params_trainable(module, module_name=name)
-
         # Use per-component FSDP2Manager init-args to parallelize components
         created_managers: Dict[str, FSDP2Manager] = {}
         if parallel_scheme is not None:

From 56bd770953e39701f548624af6deac2ff3c456b5 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 20:04:27 -0800
Subject: [PATCH 09/34] add mode

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
index 43f4075b..0c44244f 100644
--- a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
+++ b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -11,6 +11,7 @@ dist_env:
 
 model:
   pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+  mode: pretrain
 
 data:
   dataloader:

From 72f4187257eb110ba6eb160f01b50a03358a89fa Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 20:05:16 -0800
Subject: [PATCH 10/34] update build_model_and_optimizer

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/finetune.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py
index 88603e58..7777b3d2 100644
--- a/dfm/src/automodel/recipes/finetune.py
+++ b/dfm/src/automodel/recipes/finetune.py
@@ -40,6 +40,7 @@
 def build_model_and_optimizer(
     *,
     model_id: str,
+    finetune_mode: bool,
     learning_rate: float,
     device: torch.device,
     bf16_dtype: torch.dtype,
@@ -84,7 +85,9 @@ def build_model_and_optimizer(
 
     parallel_scheme = {"transformer": manager_args}
 
-    pipe, created_managers = NeMoWanPipeline.from_pretrained(
+    init_fn = NeMoWanPipeline.from_pretrained if finetune_mode else NeMoWanPipeline.from_config
+
+    pipe, created_managers = init_fn(
         model_id,
         torch_dtype=bf16_dtype,
         device=device,
@@ -212,6 +215,7 @@ def setup(self):
 
         (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer(
             model_id=self.model_id,
+            finetune_mode=self.cfg.get("model.mode", "finetune").lower() == "finetune",
             learning_rate=self.learning_rate,
             device=self.device,
             bf16_dtype=self.bf16,

From f06e7992092160a1f3c92e5a79f4a60e7f547c6e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 22:59:11 -0800
Subject: [PATCH 11/34] update

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/finetune.py | 57 +++++++++++----------------
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py
index 7777b3d2..5490e595 100644
--- a/dfm/src/automodel/recipes/finetune.py
+++ b/dfm/src/automodel/recipes/finetune.py
@@ -43,14 +43,9 @@ def build_model_and_optimizer(
     finetune_mode: bool,
     learning_rate: float,
     device: torch.device,
-    bf16_dtype: torch.dtype,
+    dtype: torch.dtype,
     cpu_offload: bool = False,
-    tp_size: int = 1,
-    cp_size: int = 1,
-    pp_size: int = 1,
-    dp_size: Optional[int] = None,
-    dp_replicate_size: Optional[int] = None,
-    use_hf_tp_plan: bool = False,
+    fsdp_cfg: Dict[str, Any] = {},
     optimizer_cfg: Optional[Dict[str, Any]] = None,
 ) -> tuple[NeMoWanPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]:
     """Build the WAN 2.1 diffusion model, parallel scheme, and optimizer."""
@@ -62,38 +57,42 @@ def build_model_and_optimizer(
 
     world_size = dist.get_world_size() if dist.is_initialized() else 1
 
-    if dp_size is None:
-        denom = max(1, tp_size * cp_size * pp_size)
-        dp_size = max(1, world_size // denom)
+    if fsdp_cfg.get("dp_size", None) is None:
+        denom = max(1, fsdp_cfg.get("tp_size", 1) * fsdp_cfg.get("cp_size", 1) * fsdp_cfg.get("pp_size", 1))
+        fsdp_cfg.dp_size = max(1, world_size // denom)
 
     manager_args: Dict[str, Any] = {
-        "dp_size": dp_size,
-        "dp_replicate_size": dp_replicate_size,
-        "tp_size": tp_size,
-        "cp_size": cp_size,
-        "pp_size": pp_size,
+        "dp_size": fsdp_cfg.get("dp_size", None),
+        "dp_replicate_size": fsdp_cfg.get("dp_replicate_size", None),
+        "tp_size": fsdp_cfg.get("tp_size", 1),
+        "cp_size": fsdp_cfg.get("cp_size", 1),
+        "pp_size": fsdp_cfg.get("pp_size", 1),
         "backend": "nccl",
         "world_size": world_size,
-        "use_hf_tp_plan": use_hf_tp_plan,
+        "use_hf_tp_plan": fsdp_cfg.get("use_hf_tp_plan", False),
         "activation_checkpointing": True,
         "mp_policy": MixedPrecisionPolicy(
-            param_dtype=bf16_dtype,
-            reduce_dtype=bf16_dtype,
-            output_dtype=bf16_dtype,
+            param_dtype=dtype,
+            reduce_dtype=dtype,
+            output_dtype=dtype,
         ),
     }
 
     parallel_scheme = {"transformer": manager_args}
 
+    kwargs = {}
+    if finetune_mode:
+        kwargs["load_for_training"] = True
+        kwargs["low_cpu_mem_usage"] = True
     init_fn = NeMoWanPipeline.from_pretrained if finetune_mode else NeMoWanPipeline.from_config
 
     pipe, created_managers = init_fn(
         model_id,
-        torch_dtype=bf16_dtype,
+        torch_dtype=dtype,
         device=device,
         parallel_scheme=parallel_scheme,
-        load_for_training=True,
         components_to_load=["transformer"],
+        **kwargs,
     )
     fsdp2_manager = created_managers["transformer"]
     transformer_module = pipe.transformer
@@ -206,26 +205,14 @@ def setup(self):
             logging.info(f"[INFO]   - Flow shift: {self.flow_shift}")
             logging.info(f"[INFO]   - Mix uniform ratio: {self.mix_uniform_ratio}")
 
-        tp_size = fsdp_cfg.get("tp_size", 1)
-        cp_size = fsdp_cfg.get("cp_size", 1)
-        pp_size = fsdp_cfg.get("pp_size", 1)
-        dp_size = fsdp_cfg.get("dp_size", None)
-        dp_replicate_size = fsdp_cfg.get("dp_replicate_size", None)
-        use_hf_tp_plan = fsdp_cfg.get("use_hf_tp_plan", False)
-
         (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer(
             model_id=self.model_id,
             finetune_mode=self.cfg.get("model.mode", "finetune").lower() == "finetune",
             learning_rate=self.learning_rate,
             device=self.device,
-            bf16_dtype=self.bf16,
+            dtype=self.bf16,
             cpu_offload=self.cpu_offload,
-            tp_size=tp_size,
-            cp_size=cp_size,
-            pp_size=pp_size,
-            dp_size=dp_size,
-            dp_replicate_size=dp_replicate_size,
-            use_hf_tp_plan=use_hf_tp_plan,
+            fsdp_cfg=fsdp_cfg,
             optimizer_cfg=self.cfg.get("optim.optimizer", {}),
         )
 

From bd02816de91c8219f62cda799d8d995b89f11321 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 22:59:41 -0800
Subject: [PATCH 12/34] update NeMoWanPipeline

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
index b7eb979a..2f6351b7 100644
--- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
+++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
@@ -157,7 +157,7 @@ def from_pretrained(
         return pipe, created_managers
 
 
-class NeMoWanPipeline(WanPipeline):
+class NeMoWanPipeline:
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -179,7 +179,7 @@ def from_config(
         # Load just the config
         from diffusers import WanTransformer3DModel
 
-        if model_id is not None:
+        if config is None:
             transformer = WanTransformer3DModel.from_pretrained(
                 model_id,
                 subfolder="transformer",

From 09baf0f2931a9bd11925f610e38dc71f2af9ea74 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:03:56 -0800
Subject: [PATCH 13/34] rename recipes/finetune.py to recipes/train.py

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/{finetune.py => train.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename dfm/src/automodel/recipes/{finetune.py => train.py} (100%)

diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/train.py
similarity index 100%
rename from dfm/src/automodel/recipes/finetune.py
rename to dfm/src/automodel/recipes/train.py

From 02ab4fc0caf67ec9feb347114ee6a7b31c1e8756 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:05:14 -0800
Subject: [PATCH 14/34] move examples

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/examples/Automodel/pretrain/pretrain.py                    | 2 +-
 {dfm/examples => examples}/automodel/finetune/finetune.py      | 2 +-
 .../automodel/finetune/wan2_1_t2v_flow.yaml                    | 0
 .../automodel/finetune/wan2_1_t2v_flow_multinode.yaml          | 0
 {dfm/examples => examples}/automodel/generate/wan_generate.py  | 0
 examples/dtensor/README.md                                     | 3 ---
 examples/dtensor/configs/README.md                             | 3 ---
 examples/dtensor/scripts/README.md                             | 3 ---
 8 files changed, 2 insertions(+), 11 deletions(-)
 rename {dfm/examples => examples}/automodel/finetune/finetune.py (94%)
 rename {dfm/examples => examples}/automodel/finetune/wan2_1_t2v_flow.yaml (100%)
 rename {dfm/examples => examples}/automodel/finetune/wan2_1_t2v_flow_multinode.yaml (100%)
 rename {dfm/examples => examples}/automodel/generate/wan_generate.py (100%)
 delete mode 100644 examples/dtensor/README.md
 delete mode 100644 examples/dtensor/configs/README.md
 delete mode 100644 examples/dtensor/scripts/README.md

diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py
index f1038198..ec054d27 100644
--- a/dfm/examples/Automodel/pretrain/pretrain.py
+++ b/dfm/examples/Automodel/pretrain/pretrain.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from Automodel.recipes.train import TrainWan21DiffusionRecipe
+from automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 
diff --git a/dfm/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py
similarity index 94%
rename from dfm/examples/automodel/finetune/finetune.py
rename to examples/automodel/finetune/finetune.py
index 7d77162c..1f12c336 100644
--- a/dfm/examples/automodel/finetune/finetune.py
+++ b/examples/automodel/finetune/finetune.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from Automodel.recipes.train import TrainWan21DiffusionRecipe
+from automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 
diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml b/examples/automodel/finetune/wan2_1_t2v_flow.yaml
similarity index 100%
rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml
rename to examples/automodel/finetune/wan2_1_t2v_flow.yaml
diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
similarity index 100%
rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
rename to examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
diff --git a/dfm/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py
similarity index 100%
rename from dfm/examples/automodel/generate/wan_generate.py
rename to examples/automodel/generate/wan_generate.py
diff --git a/examples/dtensor/README.md b/examples/dtensor/README.md
deleted file mode 100644
index 709a9755..00000000
--- a/examples/dtensor/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# DTensor Models (NeMo Automodel)
-
-Examples using NeMo Automodel with distributed tensor parallelism.
diff --git a/examples/dtensor/configs/README.md b/examples/dtensor/configs/README.md
deleted file mode 100644
index c7df1772..00000000
--- a/examples/dtensor/configs/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Configs
-
-Configuration files for various Wan model versions.
diff --git a/examples/dtensor/scripts/README.md b/examples/dtensor/scripts/README.md
deleted file mode 100644
index 0a18e12b..00000000
--- a/examples/dtensor/scripts/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Scripts
-
-Training scripts for pretraining and finetuning.

From 6f64890d8782b3c601909dd78f8bfb59d1fc792d Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:08:30 -0800
Subject: [PATCH 15/34] move pretrain example to examples/automodel

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../Automodel => examples/automodel}/pretrain/pretrain.py         | 0
 .../automodel}/pretrain/wan2_1_t2v_flow.yaml                      | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename {dfm/examples/Automodel => examples/automodel}/pretrain/pretrain.py (100%)
 rename {dfm/examples/Automodel => examples/automodel}/pretrain/wan2_1_t2v_flow.yaml (100%)

diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py
similarity index 100%
rename from dfm/examples/Automodel/pretrain/pretrain.py
rename to examples/automodel/pretrain/pretrain.py
diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
similarity index 100%
rename from dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml
rename to examples/automodel/pretrain/wan2_1_t2v_flow.yaml

From ec15d7e53ce1804e37b8d5e6400931a32a9033c5 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:09:43 -0800
Subject: [PATCH 16/34] fix recipe import path in example scripts

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/finetune/finetune.py | 2 +-
 examples/automodel/pretrain/pretrain.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py
index 1f12c336..d1e840ee 100644
--- a/examples/automodel/finetune/finetune.py
+++ b/examples/automodel/finetune/finetune.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from automodel.recipes.train import TrainWan21DiffusionRecipe
+from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 
diff --git a/examples/automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py
index ec054d27..e77f9ede 100644
--- a/examples/automodel/pretrain/pretrain.py
+++ b/examples/automodel/pretrain/pretrain.py
@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from automodel.recipes.train import TrainWan21DiffusionRecipe
+from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
 

From c2299ce512a77ba05c48e1317a099c47818372b3 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:13:39 -0800
Subject: [PATCH 17/34] fix import ordering in example scripts

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/finetune/finetune.py | 3 ++-
 examples/automodel/pretrain/pretrain.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py
index d1e840ee..5c9da942 100644
--- a/examples/automodel/finetune/finetune.py
+++ b/examples/automodel/finetune/finetune.py
@@ -14,9 +14,10 @@
 
 from __future__ import annotations
 
-from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
+from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
+
 
 def main(default_config_path="/opt/DFM/dfm/examples/Automodel/finetune/wan2_1_t2v_flow.yaml"):
     cfg = parse_args_and_load_config(default_config_path)
diff --git a/examples/automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py
index e77f9ede..f7a38930 100644
--- a/examples/automodel/pretrain/pretrain.py
+++ b/examples/automodel/pretrain/pretrain.py
@@ -14,9 +14,10 @@
 
 from __future__ import annotations
 
-from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
+from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
+
 
 def main(default_config_path="/opt/DFM/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml"):
     cfg = parse_args_and_load_config(default_config_path)

From d66290599a7b71f035eefd6e2eca1cf26bc9553b Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:15:03 -0800
Subject: [PATCH 18/34] fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
index 0c44244f..88eabc76 100644
--- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
+++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -62,4 +62,4 @@ checkpoint:
   checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/
   model_save_format: torch_save
   save_consolidated: false
-  restore_from: null
\ No newline at end of file
+  restore_from: nul
\ No newline at end of file

From 8b727473cba19fb72f77db383606cdcd0c9b0594 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:16:18 -0800
Subject: [PATCH 19/34] fix restore_from typo in pretrain config

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
index 88eabc76..caf87ab6 100644
--- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
+++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -62,4 +62,5 @@ checkpoint:
   checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/
   model_save_format: torch_save
   save_consolidated: false
-  restore_from: nul
\ No newline at end of file
+  restore_from: null
+

From 99aa19ad9551786146309009290e37fe97c07b2e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Wed, 12 Nov 2025 23:17:56 -0800
Subject: [PATCH 20/34] remove trailing blank line from pretrain config

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
index caf87ab6..113d500f 100644
--- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
+++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -63,4 +63,3 @@ checkpoint:
   model_save_format: torch_save
   save_consolidated: false
   restore_from: null
-

From 501c3d1ac263e3815fa15da30ae0ed1f63cb6330 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Thu, 13 Nov 2025 12:41:26 -0800
Subject: [PATCH 21/34] fix imports

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py | 2 +-
 dfm/src/automodel/datasets/__init__.py                  | 2 +-
 dfm/src/automodel/flow_matching/training_step_t2v.py    | 2 +-
 dfm/src/automodel/recipes/train.py                      | 4 ++--
 examples/automodel/generate/wan_generate.py             | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
index 2f6351b7..2d6f1756 100644
--- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
+++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
@@ -19,7 +19,7 @@
 
 import torch
 import torch.nn as nn
-from Automodel.distributed.dfm_parallelizer import WanParallelizationStrategy
+from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy
 from diffusers import DiffusionPipeline, WanPipeline
 from nemo_automodel.components.distributed import parallelizer
 from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager
diff --git a/dfm/src/automodel/datasets/__init__.py b/dfm/src/automodel/datasets/__init__.py
index a3ef8358..051d4cd2 100644
--- a/dfm/src/automodel/datasets/__init__.py
+++ b/dfm/src/automodel/datasets/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from Automodel.datasets.wan21 import (
+from dfm.src.automodel.datasets.wan21 import (
     MetaFilesDataset,
     build_node_parallel_sampler,
     build_wan21_dataloader,
diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py
index dc80ce74..8e4cce5c 100644
--- a/dfm/src/automodel/flow_matching/training_step_t2v.py
+++ b/dfm/src/automodel/flow_matching/training_step_t2v.py
@@ -19,7 +19,7 @@
 from typing import Dict, Tuple
 
 import torch
-from Automodel.flow_matching.time_shift_utils import (
+from dfm.src.automodel.flow_matching.time_shift_utils import (
     compute_density_for_timestep_sampling,
 )
 
diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py
index 5490e595..8f560fa3 100644
--- a/dfm/src/automodel/recipes/train.py
+++ b/dfm/src/automodel/recipes/train.py
@@ -22,8 +22,8 @@
 import torch
 import torch.distributed as dist
 import wandb
-from Automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
-from Automodel.flow_matching.training_step_t2v import (
+from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
+from dfm.src.automodel.flow_matching.training_step_t2v import (
     step_fsdp_transformer_t2v,
 )
 from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig
diff --git a/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py
index 2868ef9b..ae5c928a 100644
--- a/examples/automodel/generate/wan_generate.py
+++ b/examples/automodel/generate/wan_generate.py
@@ -18,7 +18,7 @@
 
 import torch
 import torch.distributed as dist
-from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
+from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
 from diffusers import AutoencoderKLWan
 from diffusers.utils import export_to_video
 from nemo_automodel.components.distributed.init_utils import initialize_distributed

From 298ee2d65f655d806f6c76654df120e6c2933c15 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Thu, 13 Nov 2025 12:46:07 -0800
Subject: [PATCH 22/34] lint

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .../automodel/_diffusers/auto_diffusion_pipeline.py   |  3 ++-
 dfm/src/automodel/flow_matching/training_step_t2v.py  |  1 +
 dfm/src/automodel/recipes/train.py                    | 11 ++++++-----
 examples/automodel/generate/wan_generate.py           |  3 ++-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
index 2d6f1756..cb9e9d00 100644
--- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
+++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py
@@ -19,12 +19,13 @@
 
 import torch
 import torch.nn as nn
-from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy
 from diffusers import DiffusionPipeline, WanPipeline
 from nemo_automodel.components.distributed import parallelizer
 from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager
 from nemo_automodel.shared.utils import dtype_from_str
 
+from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy
+
 
 logger = logging.getLogger(__name__)
 
diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py
index 8e4cce5c..18cce361 100644
--- a/dfm/src/automodel/flow_matching/training_step_t2v.py
+++ b/dfm/src/automodel/flow_matching/training_step_t2v.py
@@ -19,6 +19,7 @@
 from typing import Dict, Tuple
 
 import torch
+
 from dfm.src.automodel.flow_matching.time_shift_utils import (
     compute_density_for_timestep_sampling,
 )
diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py
index 8f560fa3..474cfe98 100644
--- a/dfm/src/automodel/recipes/train.py
+++ b/dfm/src/automodel/recipes/train.py
@@ -21,11 +21,6 @@
 
 import torch
 import torch.distributed as dist
-import wandb
-from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
-from dfm.src.automodel.flow_matching.training_step_t2v import (
-    step_fsdp_transformer_t2v,
-)
 from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig
 from nemo_automodel.components.loggers.log_utils import setup_logging
 from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages
@@ -36,6 +31,12 @@
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from transformers.utils.hub import TRANSFORMERS_CACHE
 
+import wandb
+from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
+from dfm.src.automodel.flow_matching.training_step_t2v import (
+    step_fsdp_transformer_t2v,
+)
+
 
 def build_model_and_optimizer(
     *,
diff --git a/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py
index ae5c928a..829ff308 100644
--- a/examples/automodel/generate/wan_generate.py
+++ b/examples/automodel/generate/wan_generate.py
@@ -18,12 +18,13 @@
 
 import torch
 import torch.distributed as dist
-from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
 from diffusers import AutoencoderKLWan
 from diffusers.utils import export_to_video
 from nemo_automodel.components.distributed.init_utils import initialize_distributed
 from nemo_automodel.components.loggers.log_utils import setup_logging
 
+from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description="Wan2.2 T2V FSDP2 generation")

From 819d7d60aed06aef49756255d1cda8eaa65a9a0e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Thu, 13 Nov 2025 12:48:12 -0800
Subject: [PATCH 23/34] more lint

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py
index 474cfe98..e24aa1c9 100644
--- a/dfm/src/automodel/recipes/train.py
+++ b/dfm/src/automodel/recipes/train.py
@@ -31,7 +31,6 @@
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from transformers.utils.hub import TRANSFORMERS_CACHE
 
-import wandb
 from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
 from dfm.src.automodel.flow_matching.training_step_t2v import (
     step_fsdp_transformer_t2v,

From 6554e4748236169cb59a46fed004001d12d65ce5 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Thu, 13 Nov 2025 15:32:42 -0800
Subject: [PATCH 24/34] fix dataloader _target_ paths and unset pretrain dp_size

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 examples/automodel/finetune/wan2_1_t2v_flow.yaml           | 2 +-
 examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml | 2 +-
 examples/automodel/pretrain/wan2_1_t2v_flow.yaml           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/automodel/finetune/wan2_1_t2v_flow.yaml b/examples/automodel/finetune/wan2_1_t2v_flow.yaml
index 6b4e3528..cced17b9 100644
--- a/examples/automodel/finetune/wan2_1_t2v_flow.yaml
+++ b/examples/automodel/finetune/wan2_1_t2v_flow.yaml
@@ -14,7 +14,7 @@ model:
 
 data:
   dataloader:
-    _target_: Automodel.datasets.build_wan21_dataloader
+    _target_: dfm.src.automodel.datasets.build_wan21_dataloader
     meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
     batch_size: 1
     num_workers: 2
diff --git a/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
index 16d4793a..20539da5 100644
--- a/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
+++ b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml
@@ -14,7 +14,7 @@ model:
 
 data:
   dataloader:
-    _target_: Automodel.datasets.build_wan21_dataloader
+    _target_: dfm.src.automodel.datasets.build_wan21_dataloader
     meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
     batch_size: 1
     num_workers: 2
diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
index 113d500f..eeabb29a 100644
--- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
+++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml
@@ -15,7 +15,7 @@ model:
 
 data:
   dataloader:
-    _target_: Automodel.datasets.build_wan21_dataloader
+    _target_: dfm.src.automodel.datasets.build_wan21_dataloader
     meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
     batch_size: 1
     num_workers: 2
@@ -51,7 +51,7 @@ fsdp:
   cp_size: 1
   pp_size: 1
   dp_replicate_size: 1
-  dp_size: 8
+  dp_size: none
 
 logging:
   save_every: 1000

From f6600da867ee466a96ea6efd1024399534cc3a7e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Thu, 13 Nov 2025 15:46:06 -0800
Subject: [PATCH 25/34] remove Automodel submodule, bump Megatron-Bridge

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .gitmodules              | 3 ---
 3rdparty/Automodel       | 1 -
 3rdparty/Megatron-Bridge | 2 +-
 3 files changed, 1 insertion(+), 5 deletions(-)
 delete mode 160000 3rdparty/Automodel

diff --git a/.gitmodules b/.gitmodules
index 8ad240e1..274454c7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,3 @@
 [submodule "3rdparty/Automodel"]
 	path = 3rdparty/Automodel
 	url = https://github.com/NVIDIA-NeMo/Automodel.git
-[submodule "3rdparty/Megatron-Bridge"]
-	path = 3rdparty/Megatron-Bridge
-	url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
diff --git a/3rdparty/Automodel b/3rdparty/Automodel
deleted file mode 160000
index a5f06522..00000000
--- a/3rdparty/Automodel
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a5f06522d4f8ef67bb9bbdd9502e50ae27d2fee5
diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge
index 4e4ce420..8e21f81a 160000
--- a/3rdparty/Megatron-Bridge
+++ b/3rdparty/Megatron-Bridge
@@ -1 +1 @@
-Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a
+Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9

From 4935ec638e9e249dad4b531a3fce30f095c644fc Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 14 Nov 2025 09:46:59 -0800
Subject: [PATCH 26/34] add torch-cu124 extra and PyTorch cu124 index

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 pyproject.toml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 05a40a68..5e403755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,6 +96,11 @@ automodel = [
     "nemo-automodel",
 ]
 megatron-bridge = ["megatron-bridge"]
+torch-cu124 = [
+    "torch",
+    "torchvision",
+    "torchaudio",
+]
 
 [tool.setuptools]
 packages = ["dfm"]
@@ -124,6 +129,11 @@ override-dependencies = [
     "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
 ]
 
+[[tool.uv.index]]
+name = "pytorch-cu124"
+url = "https://download.pytorch.org/whl/cu124"
+explicit = true
+
 [[tool.uv.index]]
 name = "pypi"
 url = "https://pypi.org/simple"

From c286199f39a1d061161bb15d073f8434b9672154 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 14 Nov 2025 09:47:08 -0800
Subject: [PATCH 27/34] update uv.lock

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 uv.lock | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/uv.lock b/uv.lock
index 02355563..b6443987 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3494,6 +3494,11 @@ test = [
     { name = "pytest-mock" },
     { name = "pytest-runner" },
 ]
+torch-cu124 = [
+    { name = "torch", marker = "sys_platform == 'never'" },
+    { name = "torchaudio" },
+    { name = "torchvision", marker = "sys_platform == 'never'" },
+]
 
 [package.metadata]
 requires-dist = [
@@ -3542,6 +3547,11 @@ test = [
     { name = "pytest-mock", specifier = ">=3.14.0" },
     { name = "pytest-runner", specifier = ">=6.0.1" },
 ]
+torch-cu124 = [
+    { name = "torch" },
+    { name = "torchaudio" },
+    { name = "torchvision" },
+]
 
 [[package]]
 name = "networkx"
@@ -6239,7 +6249,7 @@ wheels = [
 
 [[package]]
 name = "torch"
-version = "2.9.1"
+version = "2.9.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },
@@ -6262,6 +6272,44 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/a7/b888635fbb6ae951cffd41e1318966cbed96ec762b4999815ab68269e23f/torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b", size = 1067611, upload-time = "2025-10-24T01:03:01.357Z" },
 ]
 
+[[package]]
+name = "torchaudio"
+version = "2.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "torch", marker = "sys_platform == 'never'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1c/87/7de58c8f4c1946ec4d9070354eae73d1e4f3d2426e5cfa45febbd8451ce5/torchaudio-2.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd13541197e035338bd43225b2067532056486d357c661e12d49ace4fc37f8bb", size = 805912, upload-time = "2025-11-12T15:25:47.857Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/1b/680ca01211a39746aedf54e475783f846fbd7961dfeb17bce7d123f931f0/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31ec46b718b7caa0182221bfb42e2ad223947b752a996dcdc0388c34a678c966", size = 472829, upload-time = "2025-11-12T15:25:46.519Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ee/d71e6d78d203d72f99c426fbbf2bcd801cf084d8f1891bb1f42c95bc5ec5/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ee11695b367f64638b4a0340cc9abb9be2173c6537bfe4ab286c6fbff68a1444", size = 2055454, upload-time = "2025-11-12T15:25:50.519Z" },
+    { url = "https://files.pythonhosted.org/packages/19/43/dcfadd58a21704835da8bcc43bbb999887a7a1f8965aab527bd50459272c/torchaudio-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:acffac66d0908baa4ef16ce5ce6d2a7bc10c2534fce719b146744f306ba08c4a", size = 663868, upload-time = "2025-11-12T15:25:51.755Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/6b/34e489fcb4adc4b571a166f2670cc7f156cbe3337867a892fade0a1a5224/torchaudio-2.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6e3f5943135701168d30196e2befd46290180cdbb9ee508b167730d51f43208f", size = 807349, upload-time = "2025-11-12T15:25:57.843Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/52/66830da8b638368bc0aef064f3307c88d28b526ff8e60a1fda681466b1b3/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d192cf3b1b677f6666dad60caf0ce7bab66965751570c694645dd905a6c61724", size = 474291, upload-time = "2025-11-12T15:25:45.21Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/6f/d8f1f36c9f63ddef78f00f8f8ddb9638128ceb5f6824c28bead5af48fc63/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8327e21f51dced2b6de3ac6a63f04bae9be9bc213e151f85c76164568c7ebc3d", size = 2058677, upload-time = "2025-11-12T15:25:53.09Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/ef/0ec42e783774bd1dda8bc2489e18b3e9c0a250384e0131cec9f35949f385/torchaudio-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:b41339a71b186bad238d94cfb68d4c202db0033088a7b824ce5484674bf67057", size = 664681, upload-time = "2025-11-12T15:25:59.08Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/83/71cbadd7b66753818b5775f2088bad4f721d581de276996df4968000a626/torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7581ef170794c599aed55918e00d0acd9e5c9a0f19400c9a9a840955180365c5", size = 808098, upload-time = "2025-11-12T15:26:01.408Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/2d/32e8bec360459107f9b451cc1a5b6fdd5f1d3e653e65a111502084f21e3a/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:742f9d24db5f1f46d8c7e29c599fe55b866d92c4a8181fcb95eab12da225ceb0", size = 474604, upload-time = "2025-11-12T15:25:49.122Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/0d/b5af1d55ede1ca07769a2cf71256073d8958e2a5521fc734fc19f5343283/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4533fdafba73d7bcfcb5f1225b2cc8974a290ed0fe54c44638d6f440e91b8999", size = 2059899, upload-time = "2025-11-12T15:26:19.363Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/7c/df90eb0b337cbad59296ed91778e32be069330f5186256d4ce9ea603d324/torchaudio-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:923dccc67be4a6cbb45c3dcc2d69ee182bda75b09b69bc88cd3bcdfc739883a2", size = 665337, upload-time = "2025-11-12T15:26:07.407Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/1b/3321ad6379ac2d968064704e8d015c31ccae5d1ece070f87fb44b17d90e6/torchaudio-2.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bb69557484c92513a980027ec4cb314b0f43cf4442bbfd97440e66528dbad22d", size = 808136, upload-time = "2025-11-12T15:26:00.276Z" },
+    { url = "https://files.pythonhosted.org/packages/76/e2/fe55b3882157fd57aa131f5bcad90f0329be90827e1c0e0c482662ddef38/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ba2799ceec5e4373a0aa26df30d608f1eaaefd8ac4a7ae0c3446f63106f5b5a5", size = 474349, upload-time = "2025-11-12T15:26:02.78Z" },
+    { url = "https://files.pythonhosted.org/packages/74/d3/0b090c03cac5a20691507e0945589a696fb10402ccd2457eea47dbf8a71b/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bc3c8e9a240bfad8bc61f769324a4f3ce5d60eec161369d457c595c35dbb10c7", size = 2060343, upload-time = "2025-11-12T15:26:03.88Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/db/2555cfd428f4bf09a4df1c6f9204d0acc217c46edb35776c16e7a2a9a1c9/torchaudio-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:13ee96ea9bbbc85e198cb671273af06f010e6981d7b912d001eef6bc74e23f4f", size = 665301, upload-time = "2025-11-12T15:26:04.952Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/58/e82d8b5f447abdddc950965f1395f36baef3602643dd069100c6369ba73e/torchaudio-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9290f6a6409deb1f9113d5aef97ec646eeee6410b6bcc57ab8b57066b54da7c1", size = 813456, upload-time = "2025-11-12T15:26:13.963Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/45/dd9ad6af9bb595095cd98028d270f933760968b92a3497282e31289ef3b4/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:eeae7ca60b64c4bfb78fbd104a089d072b151423d5d2f90da1da00787f03b800", size = 476577, upload-time = "2025-11-12T15:26:09.54Z" },
+    { url = "https://files.pythonhosted.org/packages/79/97/c49aeb01d8a9ced2b8215a38b69b8eafd1afe295a487a73b7030c6ff3396/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:5f445e896215e6f7bba497dc68aab1e6cb077ae0ab3a90095067f16df6a9bb98", size = 2062158, upload-time = "2025-11-12T15:26:10.487Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/70/30b2a0ecca2a0a5e6a8cee8952fdea3872854ea5bcd86fe3df369fdc2543/torchaudio-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c558ba70d548f7491245ed7a35310f6310d83fc7591f073ab5fed9fd38cef987", size = 669253, upload-time = "2025-11-12T15:26:06.285Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/38/0dabf362f946ab5773d3db3322718d652d70ad12a82f500d54c6c8b9cc88/torchaudio-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:69a582650279ee16ff9087f99b4234fe5d766e1bf7f0be352db5f46991854c1e", size = 810496, upload-time = "2025-11-12T15:26:11.515Z" },
+    { url = "https://files.pythonhosted.org/packages/05/1c/e05a32ee6868dc05463242db672f23dba5d042423fefcf294db4dac343a8/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:9c0d004f784c49078017f8217fdc901df0eb9724e50fb269b3a6c99b1d4eae75", size = 474566, upload-time = "2025-11-12T15:26:08.628Z" },
+    { url = "https://files.pythonhosted.org/packages/15/52/8cec1fe90f05b888f9060467e1eb8c27f9295b8729a83d443e3bd7c471d3/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d2743b28ff5538d5fdf2ff6657d392852ccdfe640ede46f566b2907ca32d8dca", size = 2060358, upload-time = "2025-11-12T15:26:12.885Z" },
+    { url = "https://files.pythonhosted.org/packages/04/73/6ba396813d714f895f86c82be61b590fbe14255ebe6866f5ea5916c075a3/torchaudio-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:234c7a9d4d0a6ed735cd37965baa9a89ca36bdbebece8a6a5ff7727acbb43026", size = 665039, upload-time = "2025-11-12T15:26:18.308Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/f6/237e00a04dea497a40a8567d024dfb39193abec3ca3695ad51919ad633d1/torchaudio-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e13cb38971ac259fc4e102282a3e48f6df5f0ab00eb785ca5155e3392d1e86f1", size = 813463, upload-time = "2025-11-12T15:26:16.261Z" },
+    { url = "https://files.pythonhosted.org/packages/57/99/5fcd46a80086030899badeb5a934fab337c88325b3f68c60faa0b672d4d2/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:35c96ed1011b50eaf17948da173b09450cdc5bb7f908687571adb4a4c072c05e", size = 476577, upload-time = "2025-11-12T15:26:17.355Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/4c/bc428f71d5ef728fba2ecb151a3a6d187e6f0b9446b76e4f87e46d2206a3/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:c220c4acf9914cce2dc81c3624d7c84008ef436dc31bcbb89e8f4416d3615a34", size = 2062170, upload-time = "2025-11-12T15:26:20.837Z" },
+    { url = "https://files.pythonhosted.org/packages/07/0e/be41f412e1225bdbd9b7fd7f41a20f070c707f5274b82542eeccf6dc2b79/torchaudio-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:cfd12934c7b54b41d4c79dfd26fbfe88fafa9cc5cc77c074e953bb7018d9322c", size = 669265, upload-time = "2025-11-12T15:26:14.976Z" },
+]
+
 [[package]]
 name = "torchdata"
 version = "0.11.0"
@@ -6291,7 +6339,7 @@ wheels = [
 
 [[package]]
 name = "torchvision"
-version = "0.24.1"
+version = "0.24.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },

From 6c6e35d289e2ab9fdafeaa77b986fe2ca6e44012 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 14 Nov 2025 09:48:12 -0800
Subject: [PATCH 28/34] fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/train.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py
index e24aa1c9..8f9f4c34 100644
--- a/dfm/src/automodel/recipes/train.py
+++ b/dfm/src/automodel/recipes/train.py
@@ -31,6 +31,7 @@
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from transformers.utils.hub import TRANSFORMERS_CACHE
 
+import wandb
 from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
 from dfm.src.automodel.flow_matching.training_step_t2v import (
     step_fsdp_transformer_t2v,
@@ -119,7 +120,7 @@ def build_model_and_optimizer(
 
     logging.info("[INFO] NeMoAutoDiffusion setup complete (pipeline + optimizer)")
 
-    return pipe, optimizer, fsdp2_manager.device_mesh
+    return pipe, optimizer, getattr(fsdp2_manager, "device_mesh", None)
 
 
 def build_lr_scheduler(
@@ -272,6 +273,9 @@ def setup(self):
             raise RuntimeError("Training dataloader is empty; cannot proceed with training")
 
         # Derive DP size consistent with model parallel config
+        tp_size = fsdp_cfg.get("tp_size", 1)
+        cp_size = fsdp_cfg.get("cp_size", 1)
+        pp_size = fsdp_cfg.get("pp_size", 1)
         denom = max(1, tp_size * cp_size * pp_size)
         self.dp_size = fsdp_cfg.get("dp_size", None)
         if self.dp_size is None:

From 3f8b64fe1ca66a53f06842faeb12568ac65bc57e Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 14 Nov 2025 09:56:32 -0800
Subject: [PATCH 29/34] update

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 uv.lock | 223 +++-----------------------------------------------------
 1 file changed, 9 insertions(+), 214 deletions(-)

diff --git a/uv.lock b/uv.lock
index b6443987..5a28703f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2656,7 +2656,8 @@ wheels = [
 
 [[package]]
 name = "megatron-bridge"
-source = { directory = "3rdparty/Megatron-Bridge" }
+version = "0.3.0rc0"
+source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" }
 dependencies = [
     { name = "causal-conv1d" },
     { name = "datasets" },
@@ -2678,70 +2679,10 @@ dependencies = [
     { name = "wandb" },
 ]
 
-[package.metadata]
-requires-dist = [
-    { name = "causal-conv1d" },
-    { name = "datasets" },
-    { name = "hydra-core", specifier = ">1.3,<=1.3.2" },
-    { name = "mamba-ssm" },
-    { name = "megatron-core", extras = ["dev", "mlm"], directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" },
-    { name = "nemo-run", marker = "extra == 'recipes'", specifier = ">=0.5.0a0,<0.6.0" },
-    { name = "nvdlfw-inspect", marker = "extra == 'tensor-inspect'", specifier = "==0.2.1" },
-    { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=54f85fe422d296cf04ea524130014bd3a2c3add1" },
-    { name = "omegaconf", specifier = ">=2.3.0" },
-    { name = "pyyaml", specifier = ">=6.0.2" },
-    { name = "qwen-vl-utils" },
-    { name = "regex", specifier = ">=2024.11.6" },
-    { name = "rich" },
-    { name = "six", specifier = ">=1.17.0" },
-    { name = "tensorboard", specifier = ">=2.19.0" },
-    { name = "tqdm", specifier = ">=4.67.1" },
-    { name = "transformer-engine", extras = ["pytorch"], git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" },
-    { name = "transformers", specifier = ">=4.57.1" },
-    { name = "typing-extensions" },
-    { name = "wandb", specifier = ">=0.19.10" },
-]
-provides-extras = ["recipes", "tensor-inspect"]
-
-[package.metadata.requires-dev]
-build = [
-    { name = "cython", specifier = ">=3.0.0" },
-    { name = "ninja" },
-    { name = "numpy", specifier = "<2.0.0" },
-    { name = "nvidia-mathdx" },
-    { name = "pybind11" },
-    { name = "setuptools" },
-    { name = "torch" },
-]
-dev = [
-    { name = "mypy", specifier = ">=1.8.0" },
-    { name = "pre-commit", specifier = ">=3.6.0" },
-    { name = "ruff", specifier = ">=0.9.9" },
-]
-docs = [
-    { name = "myst-parser", specifier = ">=4.0.1" },
-    { name = "nvidia-sphinx-theme", specifier = ">=0.0.8" },
-    { name = "sphinx", specifier = ">=8.1.3" },
-    { name = "sphinx-autobuild", specifier = ">=2024.10.3" },
-    { name = "sphinx-autodoc2", specifier = ">=0.5.0" },
-    { name = "sphinx-copybutton", specifier = ">=0.5.2" },
-    { name = "sphinxcontrib-mermaid" },
-]
-test = [
-    { name = "click" },
-    { name = "coverage", specifier = ">=7.8.1" },
-    { name = "flake8", specifier = ">=7.2.0" },
-    { name = "pygithub" },
-    { name = "pylint", specifier = ">=3.3.7" },
-    { name = "pytest", specifier = ">=8.3.5" },
-    { name = "pytest-mock", specifier = ">=3.14.0" },
-    { name = "pytest-runner", specifier = ">=6.0.1" },
-    { name = "pytest-timeout", specifier = ">=2.4.0" },
-]
-
 [[package]]
 name = "megatron-core"
-source = { directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" }
+version = "0.16.0rc0"
+source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?subdirectory=3rdparty%2FMegatron-LM&rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" }
 dependencies = [
     { name = "numpy" },
     { name = "packaging" },
@@ -2781,94 +2722,6 @@ mlm = [
     { name = "wandb" },
 ]
 
-[package.metadata]
-requires-dist = [
-    { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" },
-    { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" },
-    { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" },
-    { name = "einops", marker = "extra == 'lts'" },
-    { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942" },
-    { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" },
-    { name = "flashinfer-python", marker = "extra == 'dev'" },
-    { name = "flask-restful", marker = "extra == 'mlm'" },
-    { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" },
-    { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" },
-    { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" },
-    { name = "numpy", specifier = "<2.0.0" },
-    { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" },
-    { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" },
-    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" },
-    { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" },
-    { name = "nvtx", marker = "extra == 'lts'" },
-    { name = "onnxscript", marker = "extra == 'dev'" },
-    { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" },
-    { name = "packaging", specifier = ">=24.2" },
-    { name = "sentencepiece", marker = "extra == 'mlm'" },
-    { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" },
-    { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" },
-    { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
-    { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" },
-    { name = "tiktoken", marker = "extra == 'mlm'" },
-    { name = "torch" },
-    { name = "tqdm", marker = "extra == 'dev'" },
-    { name = "tqdm", marker = "extra == 'lts'" },
-    { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" },
-    { name = "transformers", marker = "extra == 'lts'" },
-    { name = "transformers", marker = "extra == 'mlm'" },
-    { name = "wandb", marker = "extra == 'mlm'" },
-    { name = "wget", marker = "extra == 'dev'" },
-    { name = "wget", marker = "extra == 'lts'" },
-    { name = "zarr", marker = "extra == 'lts'" },
-]
-provides-extras = ["mlm", "dev", "lts"]
-
-[package.metadata.requires-dev]
-build = [
-    { name = "cython", specifier = ">=3.0.0" },
-    { name = "hatchling" },
-    { name = "nvidia-mathdx" },
-    { name = "packaging", specifier = ">=24.2" },
-    { name = "pybind11" },
-    { name = "setuptools", specifier = "<80.0.0" },
-    { name = "torch" },
-]
-ci = [
-    { name = "pandas" },
-    { name = "python-gitlab" },
-    { name = "slack-sdk" },
-]
-docs = [
-    { name = "myst-parser" },
-    { name = "nvidia-sphinx-theme" },
-    { name = "sphinx" },
-    { name = "sphinx-autobuild" },
-    { name = "sphinx-autodoc2" },
-    { name = "sphinx-copybutton" },
-]
-flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }]
-linting = [
-    { name = "black", specifier = "==24.4.2" },
-    { name = "flake8", specifier = "==7.1.0" },
-    { name = "isort", specifier = "==5.13.2" },
-    { name = "pylint", specifier = "==3.2.6" },
-    { name = "ruff", specifier = "~=0.9.0" },
-]
-test = [
-    { name = "coverage" },
-    { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" },
-    { name = "nltk" },
-    { name = "pydantic" },
-    { name = "pygithub" },
-    { name = "pytest", specifier = "==8.3.5" },
-    { name = "pytest-asyncio" },
-    { name = "pytest-cov" },
-    { name = "pytest-mock" },
-    { name = "pytest-random-order" },
-    { name = "pyyaml" },
-    { name = "tensorboard" },
-    { name = "wrapt" },
-]
-
 [[package]]
 name = "megatron-energon"
 version = "6.0.1"
@@ -3360,7 +3213,8 @@ wheels = [
 
 [[package]]
 name = "nemo-automodel"
-source = { directory = "3rdparty/Automodel" }
+version = "0.1.0rc0"
+source = { git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7#17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }
 dependencies = [
     { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
     { name = "datasets" },
@@ -3380,65 +3234,6 @@ dependencies = [
     { name = "wandb" },
 ]
 
-[package.metadata]
-requires-dist = [
-    { name = "backoff", marker = "extra == 'vlm'" },
-    { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" },
-    { name = "datasets", specifier = ">=4.0.0" },
-    { name = "diffusers" },
-    { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" },
-    { name = "ftfy" },
-    { name = "imageio-ffmpeg" },
-    { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" },
-    { name = "megatron-fsdp" },
-    { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" },
-    { name = "mlflow" },
-    { name = "numba", marker = "extra == 'vlm'" },
-    { name = "numpy", marker = "extra == 'vlm'" },
-    { name = "opencv-python-headless", specifier = "==4.10.0.84" },
-    { name = "pillow", marker = "extra == 'vlm'" },
-    { name = "pybind11" },
-    { name = "pyyaml" },
-    { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
-    { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
-    { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" },
-    { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" },
-    { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" },
-    { name = "torchao" },
-    { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
-    { name = "torchdata" },
-    { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" },
-    { name = "transformers", specifier = "<=4.57.1" },
-    { name = "wandb" },
-]
-provides-extras = ["vlm", "fa", "moe"]
-
-[package.metadata.requires-dev]
-build = [
-    { name = "setuptools" },
-    { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" },
-    { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" },
-]
-dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }]
-docs = [
-    { name = "myst-parser" },
-    { name = "nvidia-sphinx-theme" },
-    { name = "sphinx" },
-    { name = "sphinx-autobuild" },
-    { name = "sphinx-autodoc2" },
-    { name = "sphinx-copybutton" },
-]
-linting = [
-    { name = "import-linter", specifier = "~=2.4" },
-    { name = "pre-commit", specifier = ">=4.2.0" },
-    { name = "ruff", specifier = "~=0.9.0" },
-]
-test = [
-    { name = "coverage" },
-    { name = "peft" },
-    { name = "pytest" },
-]
-
 [[package]]
 name = "nemo-dfm"
 source = { editable = "." }
@@ -3483,7 +3278,7 @@ docs = [
     { name = "sphinx-autodoc2" },
     { name = "sphinx-copybutton" },
 ]
-megatron-bridge = [
+megatronbridge = [
     { name = "megatron-bridge" },
 ]
 test = [
@@ -3513,7 +3308,7 @@ requires-dist = [
 ]
 
 [package.metadata.requires-dev]
-automodel = [{ name = "nemo-automodel", directory = "3rdparty/Automodel" }]
+automodel = [{ name = "nemo-automodel", git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }]
 build = [
     { name = "cython", specifier = ">=3.0.0" },
     { name = "ninja" },
@@ -3538,7 +3333,7 @@ docs = [
     { name = "sphinx-autodoc2", specifier = ">=0.5.0" },
     { name = "sphinx-copybutton", specifier = ">=0.5.2" },
 ]
-megatron-bridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }]
+megatronbridge = [{ name = "megatron-bridge", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca" }]
 test = [
     { name = "coverage", specifier = ">=7.8.1" },
     { name = "flake8", specifier = ">=7.2.0" },

From 494c3fb29512b741e40b43182c1c5c01fc432835 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Fri, 14 Nov 2025 09:58:56 -0800
Subject: [PATCH 30/34] fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 dfm/src/automodel/recipes/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py
index 8f9f4c34..5a858fde 100644
--- a/dfm/src/automodel/recipes/train.py
+++ b/dfm/src/automodel/recipes/train.py
@@ -21,6 +21,7 @@
 
 import torch
 import torch.distributed as dist
+import wandb
 from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig
 from nemo_automodel.components.loggers.log_utils import setup_logging
 from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages
@@ -31,7 +32,6 @@
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from transformers.utils.hub import TRANSFORMERS_CACHE
 
-import wandb
 from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline
 from dfm.src.automodel.flow_matching.training_step_t2v import (
     step_fsdp_transformer_t2v,

From e3290e0bb6af25390dfea3bec89d01313121c4d6 Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Mon, 17 Nov 2025 09:29:28 -0800
Subject: [PATCH 31/34] revert 3rdparty

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 .gitmodules        | 3 +++
 3rdparty/Automodel | 1 +
 pyproject.toml     | 1 -
 3 files changed, 4 insertions(+), 1 deletion(-)
 create mode 160000 3rdparty/Automodel

diff --git a/.gitmodules b/.gitmodules
index 274454c7..8ad240e1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "3rdparty/Automodel"]
 	path = 3rdparty/Automodel
 	url = https://github.com/NVIDIA-NeMo/Automodel.git
+[submodule "3rdparty/Megatron-Bridge"]
+	path = 3rdparty/Megatron-Bridge
+	url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
diff --git a/3rdparty/Automodel b/3rdparty/Automodel
new file mode 160000
index 00000000..a5f06522
--- /dev/null
+++ b/3rdparty/Automodel
@@ -0,0 +1 @@
+Subproject commit a5f06522d4f8ef67bb9bbdd9502e50ae27d2fee5
diff --git a/pyproject.toml b/pyproject.toml
index 5e403755..82933f29 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -142,7 +142,6 @@ explicit = true
 [tool.uv.sources]
 nemo-automodel = { path = "3rdparty/Automodel" }
 megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
-megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
 nvidia-resiliency-ext = { index = "pypi" }
 

From 90f9bbc3e812ee0d4fae9279c188df670bcb6ddf Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Mon, 17 Nov 2025 10:23:06 -0800
Subject: [PATCH 32/34] update uv.lock

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 uv.lock | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 213 insertions(+), 8 deletions(-)

diff --git a/uv.lock b/uv.lock
index 5a28703f..ea6de63c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2656,8 +2656,7 @@ wheels = [
 
 [[package]]
 name = "megatron-bridge"
-version = "0.3.0rc0"
-source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" }
+source = { directory = "3rdparty/Megatron-Bridge" }
 dependencies = [
     { name = "causal-conv1d" },
     { name = "datasets" },
@@ -2679,10 +2678,70 @@ dependencies = [
     { name = "wandb" },
 ]
 
+[package.metadata]
+requires-dist = [
+    { name = "causal-conv1d" },
+    { name = "datasets" },
+    { name = "hydra-core", specifier = ">1.3,<=1.3.2" },
+    { name = "mamba-ssm" },
+    { name = "megatron-core", extras = ["dev", "mlm"], directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" },
+    { name = "nemo-run", marker = "extra == 'recipes'", specifier = ">=0.5.0a0,<0.6.0" },
+    { name = "nvdlfw-inspect", marker = "extra == 'tensor-inspect'", specifier = "==0.2.1" },
+    { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=54f85fe422d296cf04ea524130014bd3a2c3add1" },
+    { name = "omegaconf", specifier = ">=2.3.0" },
+    { name = "pyyaml", specifier = ">=6.0.2" },
+    { name = "qwen-vl-utils" },
+    { name = "regex", specifier = ">=2024.11.6" },
+    { name = "rich" },
+    { name = "six", specifier = ">=1.17.0" },
+    { name = "tensorboard", specifier = ">=2.19.0" },
+    { name = "tqdm", specifier = ">=4.67.1" },
+    { name = "transformer-engine", extras = ["pytorch"], git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" },
+    { name = "transformers", specifier = ">=4.57.1" },
+    { name = "typing-extensions" },
+    { name = "wandb", specifier = ">=0.19.10" },
+]
+provides-extras = ["recipes", "tensor-inspect"]
+
+[package.metadata.requires-dev]
+build = [
+    { name = "cython", specifier = ">=3.0.0" },
+    { name = "ninja" },
+    { name = "numpy", specifier = "<2.0.0" },
+    { name = "nvidia-mathdx" },
+    { name = "pybind11" },
+    { name = "setuptools" },
+    { name = "torch" },
+]
+dev = [
+    { name = "mypy", specifier = ">=1.8.0" },
+    { name = "pre-commit", specifier = ">=3.6.0" },
+    { name = "ruff", specifier = ">=0.9.9" },
+]
+docs = [
+    { name = "myst-parser", specifier = ">=4.0.1" },
+    { name = "nvidia-sphinx-theme", specifier = ">=0.0.8" },
+    { name = "sphinx", specifier = ">=8.1.3" },
+    { name = "sphinx-autobuild", specifier = ">=2024.10.3" },
+    { name = "sphinx-autodoc2", specifier = ">=0.5.0" },
+    { name = "sphinx-copybutton", specifier = ">=0.5.2" },
+    { name = "sphinxcontrib-mermaid" },
+]
+test = [
+    { name = "click" },
+    { name = "coverage", specifier = ">=7.8.1" },
+    { name = "flake8", specifier = ">=7.2.0" },
+    { name = "pygithub" },
+    { name = "pylint", specifier = ">=3.3.7" },
+    { name = "pytest", specifier = ">=8.3.5" },
+    { name = "pytest-mock", specifier = ">=3.14.0" },
+    { name = "pytest-runner", specifier = ">=6.0.1" },
+    { name = "pytest-timeout", specifier = ">=2.4.0" },
+]
+
 [[package]]
 name = "megatron-core"
-version = "0.16.0rc0"
-source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?subdirectory=3rdparty%2FMegatron-LM&rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" }
+source = { directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" }
 dependencies = [
     { name = "numpy" },
     { name = "packaging" },
@@ -2722,6 +2781,94 @@ mlm = [
     { name = "wandb" },
 ]
 
+[package.metadata]
+requires-dist = [
+    { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" },
+    { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" },
+    { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" },
+    { name = "einops", marker = "extra == 'lts'" },
+    { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942" },
+    { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" },
+    { name = "flashinfer-python", marker = "extra == 'dev'" },
+    { name = "flask-restful", marker = "extra == 'mlm'" },
+    { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" },
+    { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" },
+    { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" },
+    { name = "numpy", specifier = "<2.0.0" },
+    { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" },
+    { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" },
+    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" },
+    { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" },
+    { name = "nvtx", marker = "extra == 'lts'" },
+    { name = "onnxscript", marker = "extra == 'dev'" },
+    { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" },
+    { name = "packaging", specifier = ">=24.2" },
+    { name = "sentencepiece", marker = "extra == 'mlm'" },
+    { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" },
+    { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" },
+    { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
+    { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" },
+    { name = "tiktoken", marker = "extra == 'mlm'" },
+    { name = "torch" },
+    { name = "tqdm", marker = "extra == 'dev'" },
+    { name = "tqdm", marker = "extra == 'lts'" },
+    { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" },
+    { name = "transformers", marker = "extra == 'lts'" },
+    { name = "transformers", marker = "extra == 'mlm'" },
+    { name = "wandb", marker = "extra == 'mlm'" },
+    { name = "wget", marker = "extra == 'dev'" },
+    { name = "wget", marker = "extra == 'lts'" },
+    { name = "zarr", marker = "extra == 'lts'" },
+]
+provides-extras = ["mlm", "dev", "lts"]
+
+[package.metadata.requires-dev]
+build = [
+    { name = "cython", specifier = ">=3.0.0" },
+    { name = "hatchling" },
+    { name = "nvidia-mathdx" },
+    { name = "packaging", specifier = ">=24.2" },
+    { name = "pybind11" },
+    { name = "setuptools", specifier = "<80.0.0" },
+    { name = "torch" },
+]
+ci = [
+    { name = "pandas" },
+    { name = "python-gitlab" },
+    { name = "slack-sdk" },
+]
+docs = [
+    { name = "myst-parser" },
+    { name = "nvidia-sphinx-theme" },
+    { name = "sphinx" },
+    { name = "sphinx-autobuild" },
+    { name = "sphinx-autodoc2" },
+    { name = "sphinx-copybutton" },
+]
+flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }]
+linting = [
+    { name = "black", specifier = "==24.4.2" },
+    { name = "flake8", specifier = "==7.1.0" },
+    { name = "isort", specifier = "==5.13.2" },
+    { name = "pylint", specifier = "==3.2.6" },
+    { name = "ruff", specifier = "~=0.9.0" },
+]
+test = [
+    { name = "coverage" },
+    { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" },
+    { name = "nltk" },
+    { name = "pydantic" },
+    { name = "pygithub" },
+    { name = "pytest", specifier = "==8.3.5" },
+    { name = "pytest-asyncio" },
+    { name = "pytest-cov" },
+    { name = "pytest-mock" },
+    { name = "pytest-random-order" },
+    { name = "pyyaml" },
+    { name = "tensorboard" },
+    { name = "wrapt" },
+]
+
 [[package]]
 name = "megatron-energon"
 version = "6.0.1"
@@ -3213,8 +3360,7 @@ wheels = [
 
 [[package]]
 name = "nemo-automodel"
-version = "0.1.0rc0"
-source = { git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7#17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }
+source = { directory = "3rdparty/Automodel" }
 dependencies = [
     { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
     { name = "datasets" },
@@ -3234,6 +3380,65 @@ dependencies = [
     { name = "wandb" },
 ]
 
+[package.metadata]
+requires-dist = [
+    { name = "backoff", marker = "extra == 'vlm'" },
+    { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" },
+    { name = "datasets", specifier = ">=4.0.0" },
+    { name = "diffusers" },
+    { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" },
+    { name = "ftfy" },
+    { name = "imageio-ffmpeg" },
+    { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" },
+    { name = "megatron-fsdp" },
+    { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" },
+    { name = "mlflow" },
+    { name = "numba", marker = "extra == 'vlm'" },
+    { name = "numpy", marker = "extra == 'vlm'" },
+    { name = "opencv-python-headless", specifier = "==4.10.0.84" },
+    { name = "pillow", marker = "extra == 'vlm'" },
+    { name = "pybind11" },
+    { name = "pyyaml" },
+    { name = "qwen-omni-utils", marker = "extra == 'vlm'" },
+    { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
+    { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" },
+    { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" },
+    { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" },
+    { name = "torchao" },
+    { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
+    { name = "torchdata" },
+    { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" },
+    { name = "transformers", specifier = "<=4.57.1" },
+    { name = "wandb" },
+]
+provides-extras = ["vlm", "fa", "moe"]
+
+[package.metadata.requires-dev]
+build = [
+    { name = "setuptools" },
+    { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" },
+    { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" },
+]
+dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }]
+docs = [
+    { name = "myst-parser" },
+    { name = "nvidia-sphinx-theme" },
+    { name = "sphinx" },
+    { name = "sphinx-autobuild" },
+    { name = "sphinx-autodoc2" },
+    { name = "sphinx-copybutton" },
+]
+linting = [
+    { name = "import-linter", specifier = "~=2.4" },
+    { name = "pre-commit", specifier = ">=4.2.0" },
+    { name = "ruff", specifier = "~=0.9.0" },
+]
+test = [
+    { name = "coverage" },
+    { name = "peft" },
+    { name = "pytest" },
+]
+
 [[package]]
 name = "nemo-dfm"
 source = { editable = "." }
@@ -3308,7 +3513,7 @@ requires-dist = [
 ]
 
 [package.metadata.requires-dev]
-automodel = [{ name = "nemo-automodel", git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }]
+automodel = [{ name = "nemo-automodel", directory = "3rdparty/Automodel" }]
 build = [
     { name = "cython", specifier = ">=3.0.0" },
     { name = "ninja" },
@@ -3333,7 +3538,7 @@ docs = [
     { name = "sphinx-autodoc2", specifier = ">=0.5.0" },
     { name = "sphinx-copybutton", specifier = ">=0.5.2" },
 ]
-megatronbridge = [{ name = "megatron-bridge", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca" }]
+megatronbridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }]
 test = [
     { name = "coverage", specifier = ">=7.8.1" },
     { name = "flake8", specifier = ">=7.2.0" },

From a0c5367e1a8797c28cce1382c34cc771613ae90f Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Mon, 17 Nov 2025 10:26:25 -0800
Subject: [PATCH 33/34] fix

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 82933f29..5e403755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -142,6 +142,7 @@ explicit = true
 [tool.uv.sources]
 nemo-automodel = { path = "3rdparty/Automodel" }
 megatron-bridge = { path = "3rdparty/Megatron-Bridge" }
+megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" }
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" }
 nvidia-resiliency-ext = { index = "pypi" }
 

From 7b108d1454e5f9f0f1b5b467070bd866a4f315df Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Date: Mon, 17 Nov 2025 10:49:48 -0800
Subject: [PATCH 34/34] update uv.lock

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
---
 uv.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/uv.lock b/uv.lock
index ea6de63c..b6443987 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3483,7 +3483,7 @@ docs = [
     { name = "sphinx-autodoc2" },
     { name = "sphinx-copybutton" },
 ]
-megatronbridge = [
+megatron-bridge = [
     { name = "megatron-bridge" },
 ]
 test = [
@@ -3538,7 +3538,7 @@ docs = [
     { name = "sphinx-autodoc2", specifier = ">=0.5.0" },
     { name = "sphinx-copybutton", specifier = ">=0.5.2" },
 ]
-megatronbridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }]
+megatron-bridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }]
 test = [
     { name = "coverage", specifier = ">=7.8.1" },
     { name = "flake8", specifier = ">=7.2.0" },