diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 4e4ce420..8e21f81a 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a +Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index d9e8c3ce..cb9e9d00 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -12,18 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os from typing import Any, Dict, Iterable, Optional, Tuple import torch import torch.nn as nn -from Automodel.distributed.dfm_parallelizer import WanParallelizationStrategy -from diffusers import DiffusionPipeline +from diffusers import DiffusionPipeline, WanPipeline from nemo_automodel.components.distributed import parallelizer from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager from nemo_automodel.shared.utils import dtype_from_str +from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy + logger = logging.getLogger(__name__) @@ -154,3 +156,71 @@ def from_pretrained( parallel_module = manager.parallelize(comp_module) setattr(pipe, comp_name, parallel_module) return pipe, created_managers + + +class NeMoWanPipeline: + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + return NeMoAutoDiffusionPipeline.from_pretrained(*args, **kwargs) + + @classmethod + def from_config( + cls, + model_id, + torch_dtype: torch.dtype = torch.bfloat16, + config: dict = None, + parallel_scheme: Optional[Dict[str, Dict[str, Any]]] = None, + device: Optional[torch.device] = None, + move_to_device: bool = True, + components_to_load: 
Optional[Iterable[str]] = None, + ): + # Load just the config + from diffusers import WanTransformer3DModel + + if config is None: + transformer = WanTransformer3DModel.from_pretrained( + model_id, + subfolder="transformer", + torch_dtype=torch.bfloat16, + ) + + # Get config and reinitialize with random weights + config = copy.deepcopy(transformer.config) + del transformer + + # Initialize with random weights + transformer = WanTransformer3DModel.from_config(config) + + # Load pipeline with random transformer + pipe = WanPipeline.from_pretrained( + model_id, + transformer=transformer, + torch_dtype=torch_dtype, + ) + # Decide device + dev = _choose_device(device) + + # Move modules to device/dtype first (helps avoid initial OOM during sharding) + if move_to_device: + for name, module in _iter_pipeline_modules(pipe): + if not components_to_load or name in components_to_load: + logger.info("[INFO] Moving module: %s to device/dtype", name) + _move_module_to_device(module, dev, torch_dtype) + + # Use per-component FSDP2Manager init-args to parallelize components + created_managers: Dict[str, FSDP2Manager] = {} + if parallel_scheme is not None: + assert torch.distributed.is_initialized(), "Expect distributed environment to be initialized" + _init_parallelizer() + for comp_name, comp_module in _iter_pipeline_modules(pipe): + manager_args = parallel_scheme.get(comp_name) + if manager_args is None: + continue + manager = FSDP2Manager(**manager_args) + created_managers[comp_name] = manager + parallel_module = manager.parallelize(comp_module) + setattr(pipe, comp_name, parallel_module) + return pipe, created_managers diff --git a/dfm/src/automodel/datasets/__init__.py b/dfm/src/automodel/datasets/__init__.py index a3ef8358..051d4cd2 100644 --- a/dfm/src/automodel/datasets/__init__.py +++ b/dfm/src/automodel/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from Automodel.datasets.wan21 import ( +from dfm.src.automodel.datasets.wan21 import ( MetaFilesDataset, build_node_parallel_sampler, build_wan21_dataloader, diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py index 0e7b9bc0..18cce361 100644 --- a/dfm/src/automodel/flow_matching/training_step_t2v.py +++ b/dfm/src/automodel/flow_matching/training_step_t2v.py @@ -19,7 +19,8 @@ from typing import Dict, Tuple import torch -from Automodel.flow_matching.time_shift_utils import ( + +from dfm.src.automodel.flow_matching.time_shift_utils import ( compute_density_for_timestep_sampling, ) @@ -28,8 +29,8 @@ def step_fsdp_transformer_t2v( - pipe, - model_map: Dict, + scheduler, + model, batch, device, bf16, @@ -40,6 +41,8 @@ def step_fsdp_transformer_t2v( logit_std: float = 1.0, flow_shift: float = 3.0, mix_uniform_ratio: float = 0.1, + sigma_min: float = 0.0, # Default: no clamping (pretrain) + sigma_max: float = 1.0, # Default: no clamping (pretrain) global_step: int = 0, ) -> Tuple[torch.Tensor, Dict]: """ @@ -74,7 +77,7 @@ def step_fsdp_transformer_t2v( # Flow Matching Timestep Sampling # ======================================================================== - num_train_timesteps = pipe.scheduler.config.num_train_timesteps + num_train_timesteps = scheduler.config.num_train_timesteps if use_sigma_noise: use_uniform = torch.rand(1).item() < mix_uniform_ratio @@ -96,12 +99,23 @@ def step_fsdp_transformer_t2v( # Apply flow shift: σ = shift/(shift + (1/u - 1)) u_clamped = torch.clamp(u, min=1e-5) # Avoid division by zero sigma = flow_shift / (flow_shift + (1.0 / u_clamped - 1.0)) - sigma = torch.clamp(sigma, 0.0, 1.0) + + # Clamp sigma (only if not full range [0,1]) + # Pretrain uses [0, 1], finetune uses [0.02, 0.55] + if sigma_min > 0.0 or sigma_max < 1.0: + sigma = torch.clamp(sigma, sigma_min, sigma_max) + else: + sigma = torch.clamp(sigma, 0.0, 1.0) else: # Simple uniform without shift u = 
torch.rand(size=(batch_size,), device=device) - sigma = u + + # Clamp sigma (only if not full range [0,1]) + if sigma_min > 0.0 or sigma_max < 1.0: + sigma = torch.clamp(u, sigma_min, sigma_max) + else: + sigma = u sampling_method = "uniform_no_shift" # ======================================================================== @@ -186,10 +200,8 @@ def step_fsdp_transformer_t2v( # Forward Pass # ======================================================================== - fsdp_model = model_map["transformer"]["fsdp_transformer"] - try: - model_pred = fsdp_model( + model_pred = model( hidden_states=noisy_latents, timestep=timesteps_for_model, encoder_hidden_states=text_embeddings, @@ -243,7 +255,7 @@ def step_fsdp_transformer_t2v( logger.info(f"[STEP {global_step}] LOSS DEBUG") logger.info("=" * 80) logger.info("[TARGET] Flow matching: v = ε - x_0") - logger.info(f"[PREDICTION] Scheduler type (inference only): {type(pipe.scheduler).__name__}") + logger.info(f"[PREDICTION] Scheduler type (inference only): {type(scheduler).__name__}") logger.info("") logger.info(f"[RANGES] Model pred: [{model_pred.min():.4f}, {model_pred.max():.4f}]") logger.info(f"[RANGES] Target (v): [{target.min():.4f}, {target.max():.4f}]") diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/train.py similarity index 88% rename from dfm/src/automodel/recipes/finetune.py rename to dfm/src/automodel/recipes/train.py index 83c60d75..5a858fde 100644 --- a/dfm/src/automodel/recipes/finetune.py +++ b/dfm/src/automodel/recipes/train.py @@ -22,10 +22,6 @@ import torch import torch.distributed as dist import wandb -from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline -from Automodel.flow_matching.training_step_t2v import ( - step_fsdp_transformer_t2v, -) from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig from nemo_automodel.components.loggers.log_utils import setup_logging from 
nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages @@ -36,68 +32,71 @@ from torch.distributed.fsdp import MixedPrecisionPolicy from transformers.utils.hub import TRANSFORMERS_CACHE +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline +from dfm.src.automodel.flow_matching.training_step_t2v import ( + step_fsdp_transformer_t2v, +) + def build_model_and_optimizer( *, model_id: str, + finetune_mode: bool, learning_rate: float, device: torch.device, - bf16_dtype: torch.dtype, + dtype: torch.dtype, cpu_offload: bool = False, - tp_size: int = 1, - cp_size: int = 1, - pp_size: int = 1, - dp_size: Optional[int] = None, - dp_replicate_size: Optional[int] = None, - use_hf_tp_plan: bool = False, + fsdp_cfg: Dict[str, Any] = {}, optimizer_cfg: Optional[Dict[str, Any]] = None, -) -> tuple[NeMoAutoDiffusionPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]: +) -> tuple[NeMoWanPipeline, torch.optim.Optimizer, Any]: """Build the WAN 2.1 diffusion model, parallel scheme, and optimizer.""" - logging.info("[INFO] Building NeMoAutoDiffusionPipeline with transformer parallel scheme...") + logging.info("[INFO] Building NeMoWanPipeline with transformer parallel scheme...") if not dist.is_initialized(): logging.info("[WARN] torch.distributed not initialized; proceeding in single-process mode") world_size = dist.get_world_size() if dist.is_initialized() else 1 - if dp_size is None: - denom = max(1, tp_size * cp_size * pp_size) - dp_size = max(1, world_size // denom) + if fsdp_cfg.get("dp_size", None) is None: + denom = max(1, fsdp_cfg.get("tp_size", 1) * fsdp_cfg.get("cp_size", 1) * fsdp_cfg.get("pp_size", 1)) + fsdp_cfg["dp_size"] = max(1, world_size // denom) manager_args: Dict[str, Any] = { - "dp_size": dp_size, - "dp_replicate_size": dp_replicate_size, - "tp_size": tp_size, - "cp_size": cp_size, - "pp_size": pp_size, + "dp_size": fsdp_cfg.get("dp_size", None), + "dp_replicate_size":
fsdp_cfg.get("dp_replicate_size", None), + "tp_size": fsdp_cfg.get("tp_size", 1), + "cp_size": fsdp_cfg.get("cp_size", 1), + "pp_size": fsdp_cfg.get("pp_size", 1), "backend": "nccl", "world_size": world_size, - "use_hf_tp_plan": use_hf_tp_plan, + "use_hf_tp_plan": fsdp_cfg.get("use_hf_tp_plan", False), "activation_checkpointing": True, "mp_policy": MixedPrecisionPolicy( - param_dtype=bf16_dtype, - reduce_dtype=bf16_dtype, - output_dtype=bf16_dtype, + param_dtype=dtype, + reduce_dtype=dtype, + output_dtype=dtype, ), } parallel_scheme = {"transformer": manager_args} - pipe, created_managers = NeMoAutoDiffusionPipeline.from_pretrained( + kwargs = {} + if finetune_mode: + kwargs["load_for_training"] = True + kwargs["low_cpu_mem_usage"] = True + init_fn = NeMoWanPipeline.from_pretrained if finetune_mode else NeMoWanPipeline.from_config + + pipe, created_managers = init_fn( model_id, - torch_dtype=bf16_dtype, + torch_dtype=dtype, device=device, parallel_scheme=parallel_scheme, - load_for_training=True, components_to_load=["transformer"], + **kwargs, ) fsdp2_manager = created_managers["transformer"] - transformer_module = getattr(pipe, "transformer", None) - if transformer_module is None: - raise RuntimeError("transformer not found in pipeline after parallelization") - - model_map: dict[str, Dict[str, Any]] = {"transformer": {"fsdp_transformer": transformer_module}} + transformer_module = pipe.transformer trainable_params = [p for p in transformer_module.parameters() if p.requires_grad] if not trainable_params: @@ -121,7 +120,7 @@ def build_model_and_optimizer( logging.info("[INFO] NeMoAutoDiffusion setup complete (pipeline + optimizer)") - return pipe, model_map, optimizer, fsdp2_manager.device_mesh + return pipe, optimizer, getattr(fsdp2_manager, "device_mesh", None) def build_lr_scheduler( @@ -198,6 +197,8 @@ def setup(self): self.logit_std = fm_cfg.get("logit_std", 1.0) self.flow_shift = fm_cfg.get("flow_shift", 3.0) self.mix_uniform_ratio = 
fm_cfg.get("mix_uniform_ratio", 0.1) + self.sigma_min = fm_cfg.get("sigma_min", 0.0) + self.sigma_max = fm_cfg.get("sigma_max", 1.0) logging.info(f"[INFO] Flow matching: {'ENABLED' if self.use_sigma_noise else 'DISABLED'}") if self.use_sigma_noise: @@ -205,29 +206,18 @@ def setup(self): logging.info(f"[INFO] - Flow shift: {self.flow_shift}") logging.info(f"[INFO] - Mix uniform ratio: {self.mix_uniform_ratio}") - tp_size = fsdp_cfg.get("tp_size", 1) - cp_size = fsdp_cfg.get("cp_size", 1) - pp_size = fsdp_cfg.get("pp_size", 1) - dp_size = fsdp_cfg.get("dp_size", None) - dp_replicate_size = fsdp_cfg.get("dp_replicate_size", None) - use_hf_tp_plan = fsdp_cfg.get("use_hf_tp_plan", False) - - (self.pipe, self.model_map, self.optimizer, self.device_mesh) = build_model_and_optimizer( + (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer( model_id=self.model_id, + finetune_mode=self.cfg.get("model.mode", "finetune").lower() == "finetune", learning_rate=self.learning_rate, device=self.device, - bf16_dtype=self.bf16, + dtype=self.bf16, cpu_offload=self.cpu_offload, - tp_size=tp_size, - cp_size=cp_size, - pp_size=pp_size, - dp_size=dp_size, - dp_replicate_size=dp_replicate_size, - use_hf_tp_plan=use_hf_tp_plan, + fsdp_cfg=fsdp_cfg, optimizer_cfg=self.cfg.get("optim.optimizer", {}), ) - self.model = self.model_map["transformer"]["fsdp_transformer"] + self.model = self.pipe.transformer self.peft_config = None batch_cfg = self.cfg.get("batch", {}) @@ -283,6 +273,9 @@ def setup(self): raise RuntimeError("Training dataloader is empty; cannot proceed with training") # Derive DP size consistent with model parallel config + tp_size = fsdp_cfg.get("tp_size", 1) + cp_size = fsdp_cfg.get("cp_size", 1) + pp_size = fsdp_cfg.get("pp_size", 1) denom = max(1, tp_size * cp_size * pp_size) self.dp_size = fsdp_cfg.get("dp_size", None) if self.dp_size is None: @@ -356,8 +349,8 @@ def run_train_validation_loop(self): for micro_batch in batch_group: try: loss, _ = 
step_fsdp_transformer_t2v( - pipe=self.pipe, - model_map=self.model_map, + scheduler=self.pipe.scheduler, + model=self.model, batch=micro_batch, device=self.device, bf16=self.bf16, @@ -367,6 +360,8 @@ def run_train_validation_loop(self): logit_std=self.logit_std, flow_shift=self.flow_shift, mix_uniform_ratio=self.mix_uniform_ratio, + sigma_min=self.sigma_min, + sigma_max=self.sigma_max, global_step=global_step, ) except Exception as exc: diff --git a/dfm/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py similarity index 93% rename from dfm/examples/automodel/finetune/finetune.py rename to examples/automodel/finetune/finetune.py index ae07451f..5c9da942 100644 --- a/dfm/examples/automodel/finetune/finetune.py +++ b/examples/automodel/finetune/finetune.py @@ -14,9 +14,10 @@ from __future__ import annotations -from Automodel.recipes.finetune import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe + def main(default_config_path="/opt/DFM/dfm/examples/Automodel/finetune/wan2_1_t2v_flow.yaml"): cfg = parse_args_and_load_config(default_config_path) diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml b/examples/automodel/finetune/wan2_1_t2v_flow.yaml similarity index 93% rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml rename to examples/automodel/finetune/wan2_1_t2v_flow.yaml index 6b4e3528..cced17b9 100644 --- a/dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml +++ b/examples/automodel/finetune/wan2_1_t2v_flow.yaml @@ -14,7 +14,7 @@ model: data: dataloader: - _target_: Automodel.datasets.build_wan21_dataloader + _target_: dfm.src.automodel.datasets.build_wan21_dataloader meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ batch_size: 1 num_workers: 2 diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml 
b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml similarity index 94% rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml rename to examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml index 16d4793a..20539da5 100644 --- a/dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml +++ b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml @@ -14,7 +14,7 @@ model: data: dataloader: - _target_: Automodel.datasets.build_wan21_dataloader + _target_: dfm.src.automodel.datasets.build_wan21_dataloader meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ batch_size: 1 num_workers: 2 diff --git a/dfm/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py similarity index 98% rename from dfm/examples/automodel/generate/wan_generate.py rename to examples/automodel/generate/wan_generate.py index 2868ef9b..829ff308 100644 --- a/dfm/examples/automodel/generate/wan_generate.py +++ b/examples/automodel/generate/wan_generate.py @@ -18,12 +18,13 @@ import torch import torch.distributed as dist -from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline from diffusers import AutoencoderKLWan from diffusers.utils import export_to_video from nemo_automodel.components.distributed.init_utils import initialize_distributed from nemo_automodel.components.loggers.log_utils import setup_logging +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline + def parse_args(): parser = argparse.ArgumentParser(description="Wan2.2 T2V FSDP2 generation") diff --git a/examples/automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py new file mode 100644 index 00000000..f7a38930 --- /dev/null +++ b/examples/automodel/pretrain/pretrain.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from nemo_automodel.components.config._arg_parser import parse_args_and_load_config + +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe + + +def main(default_config_path="/opt/DFM/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml"): + cfg = parse_args_and_load_config(default_config_path) + recipe = TrainWan21DiffusionRecipe(cfg) + recipe.setup() + recipe.run_train_validation_loop() + + +if __name__ == "__main__": + main() diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml new file mode 100644 index 00000000..eeabb29a --- /dev/null +++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml @@ -0,0 +1,65 @@ +seed: 42 + +wandb: + project: wan-t2v-flow-matching-pretrain + mode: online + name: wan2_1_t2v_fm_pretrain + +dist_env: + backend: nccl + timeout_minutes: 30 + +model: + pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers + mode: pretrain + +data: + dataloader: + _target_: dfm.src.automodel.datasets.build_wan21_dataloader + meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ + batch_size: 1 + num_workers: 2 + device: cpu + +batch: + batch_size_per_node: 8 + +training: + num_epochs: 100 + +optim: + learning_rate: 5e-5 + optimizer: + weight_decay: 0.1 + betas: [0.9, 0.95] + # "warmup_steps": 1000, + # "lr_min": 1e-5, + + +flow_matching: + 
use_sigma_noise: true + timestep_sampling: uniform + logit_mean: 0.0 + logit_std: 1.5 + flow_shift: 2.5 + mix_uniform_ratio: 0.2 + # "sigma_min": 0.0, # PRETRAIN: No clamping, full range + # "sigma_max": 1.0, # PRETRAIN: No clamping, full range + +fsdp: + tp_size: 1 + cp_size: 1 + pp_size: 1 + dp_replicate_size: 1 + dp_size: null + +logging: + save_every: 1000 + log_every: 2 + +checkpoint: + enabled: true + checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/ + model_save_format: torch_save + save_consolidated: false + restore_from: null diff --git a/examples/dtensor/README.md b/examples/dtensor/README.md deleted file mode 100644 index 709a9755..00000000 --- a/examples/dtensor/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# DTensor Models (NeMo Automodel) - -Examples using NeMo Automodel with distributed tensor parallelism. diff --git a/examples/dtensor/configs/README.md b/examples/dtensor/configs/README.md deleted file mode 100644 index c7df1772..00000000 --- a/examples/dtensor/configs/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Configs - -Configuration files for various Wan model versions. diff --git a/examples/dtensor/scripts/README.md b/examples/dtensor/scripts/README.md deleted file mode 100644 index 0a18e12b..00000000 --- a/examples/dtensor/scripts/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Scripts - -Training scripts for pretraining and finetuning.
diff --git a/pyproject.toml b/pyproject.toml index 05a40a68..5e403755 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,11 @@ automodel = [ "nemo-automodel", ] megatron-bridge = ["megatron-bridge"] +torch-cu124 = [ + "torch", + "torchvision", + "torchaudio", +] [tool.setuptools] packages = ["dfm"] @@ -124,6 +129,11 @@ override-dependencies = [ "transformer-engine[pytorch]>=2.9.0a0,<2.10.0", ] +[[tool.uv.index]] +name = "pytorch-cu124" +url = "https://download.pytorch.org/whl/cu124" +explicit = true + [[tool.uv.index]] name = "pypi" url = "https://pypi.org/simple" diff --git a/uv.lock b/uv.lock index 02355563..b6443987 100644 --- a/uv.lock +++ b/uv.lock @@ -3494,6 +3494,11 @@ test = [ { name = "pytest-mock" }, { name = "pytest-runner" }, ] +torch-cu124 = [ + { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torchaudio" }, + { name = "torchvision", marker = "sys_platform == 'never'" }, +] [package.metadata] requires-dist = [ @@ -3542,6 +3547,11 @@ test = [ { name = "pytest-mock", specifier = ">=3.14.0" }, { name = "pytest-runner", specifier = ">=6.0.1" }, ] +torch-cu124 = [ + { name = "torch" }, + { name = "torchaudio" }, + { name = "torchvision" }, +] [[package]] name = "networkx" @@ -6239,7 +6249,7 @@ wheels = [ [[package]] name = "torch" -version = "2.9.1" +version = "2.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, @@ -6262,6 +6272,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/a7/b888635fbb6ae951cffd41e1318966cbed96ec762b4999815ab68269e23f/torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b", size = 1067611, upload-time = "2025-10-24T01:03:01.357Z" }, ] +[[package]] +name = "torchaudio" +version = "2.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "torch", marker = "sys_platform == 'never'" }, +] 
+wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/87/7de58c8f4c1946ec4d9070354eae73d1e4f3d2426e5cfa45febbd8451ce5/torchaudio-2.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd13541197e035338bd43225b2067532056486d357c661e12d49ace4fc37f8bb", size = 805912, upload-time = "2025-11-12T15:25:47.857Z" }, + { url = "https://files.pythonhosted.org/packages/6d/1b/680ca01211a39746aedf54e475783f846fbd7961dfeb17bce7d123f931f0/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31ec46b718b7caa0182221bfb42e2ad223947b752a996dcdc0388c34a678c966", size = 472829, upload-time = "2025-11-12T15:25:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ee/d71e6d78d203d72f99c426fbbf2bcd801cf084d8f1891bb1f42c95bc5ec5/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ee11695b367f64638b4a0340cc9abb9be2173c6537bfe4ab286c6fbff68a1444", size = 2055454, upload-time = "2025-11-12T15:25:50.519Z" }, + { url = "https://files.pythonhosted.org/packages/19/43/dcfadd58a21704835da8bcc43bbb999887a7a1f8965aab527bd50459272c/torchaudio-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:acffac66d0908baa4ef16ce5ce6d2a7bc10c2534fce719b146744f306ba08c4a", size = 663868, upload-time = "2025-11-12T15:25:51.755Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/34e489fcb4adc4b571a166f2670cc7f156cbe3337867a892fade0a1a5224/torchaudio-2.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6e3f5943135701168d30196e2befd46290180cdbb9ee508b167730d51f43208f", size = 807349, upload-time = "2025-11-12T15:25:57.843Z" }, + { url = "https://files.pythonhosted.org/packages/a6/52/66830da8b638368bc0aef064f3307c88d28b526ff8e60a1fda681466b1b3/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d192cf3b1b677f6666dad60caf0ce7bab66965751570c694645dd905a6c61724", size = 474291, upload-time = "2025-11-12T15:25:45.21Z" }, + { url = 
"https://files.pythonhosted.org/packages/cb/6f/d8f1f36c9f63ddef78f00f8f8ddb9638128ceb5f6824c28bead5af48fc63/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8327e21f51dced2b6de3ac6a63f04bae9be9bc213e151f85c76164568c7ebc3d", size = 2058677, upload-time = "2025-11-12T15:25:53.09Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ef/0ec42e783774bd1dda8bc2489e18b3e9c0a250384e0131cec9f35949f385/torchaudio-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:b41339a71b186bad238d94cfb68d4c202db0033088a7b824ce5484674bf67057", size = 664681, upload-time = "2025-11-12T15:25:59.08Z" }, + { url = "https://files.pythonhosted.org/packages/f1/83/71cbadd7b66753818b5775f2088bad4f721d581de276996df4968000a626/torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7581ef170794c599aed55918e00d0acd9e5c9a0f19400c9a9a840955180365c5", size = 808098, upload-time = "2025-11-12T15:26:01.408Z" }, + { url = "https://files.pythonhosted.org/packages/ef/2d/32e8bec360459107f9b451cc1a5b6fdd5f1d3e653e65a111502084f21e3a/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:742f9d24db5f1f46d8c7e29c599fe55b866d92c4a8181fcb95eab12da225ceb0", size = 474604, upload-time = "2025-11-12T15:25:49.122Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0d/b5af1d55ede1ca07769a2cf71256073d8958e2a5521fc734fc19f5343283/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4533fdafba73d7bcfcb5f1225b2cc8974a290ed0fe54c44638d6f440e91b8999", size = 2059899, upload-time = "2025-11-12T15:26:19.363Z" }, + { url = "https://files.pythonhosted.org/packages/2e/7c/df90eb0b337cbad59296ed91778e32be069330f5186256d4ce9ea603d324/torchaudio-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:923dccc67be4a6cbb45c3dcc2d69ee182bda75b09b69bc88cd3bcdfc739883a2", size = 665337, upload-time = "2025-11-12T15:26:07.407Z" }, + { url = 
"https://files.pythonhosted.org/packages/c0/1b/3321ad6379ac2d968064704e8d015c31ccae5d1ece070f87fb44b17d90e6/torchaudio-2.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bb69557484c92513a980027ec4cb314b0f43cf4442bbfd97440e66528dbad22d", size = 808136, upload-time = "2025-11-12T15:26:00.276Z" }, + { url = "https://files.pythonhosted.org/packages/76/e2/fe55b3882157fd57aa131f5bcad90f0329be90827e1c0e0c482662ddef38/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ba2799ceec5e4373a0aa26df30d608f1eaaefd8ac4a7ae0c3446f63106f5b5a5", size = 474349, upload-time = "2025-11-12T15:26:02.78Z" }, + { url = "https://files.pythonhosted.org/packages/74/d3/0b090c03cac5a20691507e0945589a696fb10402ccd2457eea47dbf8a71b/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bc3c8e9a240bfad8bc61f769324a4f3ce5d60eec161369d457c595c35dbb10c7", size = 2060343, upload-time = "2025-11-12T15:26:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/a0/db/2555cfd428f4bf09a4df1c6f9204d0acc217c46edb35776c16e7a2a9a1c9/torchaudio-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:13ee96ea9bbbc85e198cb671273af06f010e6981d7b912d001eef6bc74e23f4f", size = 665301, upload-time = "2025-11-12T15:26:04.952Z" }, + { url = "https://files.pythonhosted.org/packages/0c/58/e82d8b5f447abdddc950965f1395f36baef3602643dd069100c6369ba73e/torchaudio-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9290f6a6409deb1f9113d5aef97ec646eeee6410b6bcc57ab8b57066b54da7c1", size = 813456, upload-time = "2025-11-12T15:26:13.963Z" }, + { url = "https://files.pythonhosted.org/packages/ce/45/dd9ad6af9bb595095cd98028d270f933760968b92a3497282e31289ef3b4/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:eeae7ca60b64c4bfb78fbd104a089d072b151423d5d2f90da1da00787f03b800", size = 476577, upload-time = "2025-11-12T15:26:09.54Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/97/c49aeb01d8a9ced2b8215a38b69b8eafd1afe295a487a73b7030c6ff3396/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:5f445e896215e6f7bba497dc68aab1e6cb077ae0ab3a90095067f16df6a9bb98", size = 2062158, upload-time = "2025-11-12T15:26:10.487Z" }, + { url = "https://files.pythonhosted.org/packages/ba/70/30b2a0ecca2a0a5e6a8cee8952fdea3872854ea5bcd86fe3df369fdc2543/torchaudio-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c558ba70d548f7491245ed7a35310f6310d83fc7591f073ab5fed9fd38cef987", size = 669253, upload-time = "2025-11-12T15:26:06.285Z" }, + { url = "https://files.pythonhosted.org/packages/5b/38/0dabf362f946ab5773d3db3322718d652d70ad12a82f500d54c6c8b9cc88/torchaudio-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:69a582650279ee16ff9087f99b4234fe5d766e1bf7f0be352db5f46991854c1e", size = 810496, upload-time = "2025-11-12T15:26:11.515Z" }, + { url = "https://files.pythonhosted.org/packages/05/1c/e05a32ee6868dc05463242db672f23dba5d042423fefcf294db4dac343a8/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:9c0d004f784c49078017f8217fdc901df0eb9724e50fb269b3a6c99b1d4eae75", size = 474566, upload-time = "2025-11-12T15:26:08.628Z" }, + { url = "https://files.pythonhosted.org/packages/15/52/8cec1fe90f05b888f9060467e1eb8c27f9295b8729a83d443e3bd7c471d3/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d2743b28ff5538d5fdf2ff6657d392852ccdfe640ede46f566b2907ca32d8dca", size = 2060358, upload-time = "2025-11-12T15:26:12.885Z" }, + { url = "https://files.pythonhosted.org/packages/04/73/6ba396813d714f895f86c82be61b590fbe14255ebe6866f5ea5916c075a3/torchaudio-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:234c7a9d4d0a6ed735cd37965baa9a89ca36bdbebece8a6a5ff7727acbb43026", size = 665039, upload-time = "2025-11-12T15:26:18.308Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/f6/237e00a04dea497a40a8567d024dfb39193abec3ca3695ad51919ad633d1/torchaudio-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e13cb38971ac259fc4e102282a3e48f6df5f0ab00eb785ca5155e3392d1e86f1", size = 813463, upload-time = "2025-11-12T15:26:16.261Z" }, + { url = "https://files.pythonhosted.org/packages/57/99/5fcd46a80086030899badeb5a934fab337c88325b3f68c60faa0b672d4d2/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:35c96ed1011b50eaf17948da173b09450cdc5bb7f908687571adb4a4c072c05e", size = 476577, upload-time = "2025-11-12T15:26:17.355Z" }, + { url = "https://files.pythonhosted.org/packages/a4/4c/bc428f71d5ef728fba2ecb151a3a6d187e6f0b9446b76e4f87e46d2206a3/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:c220c4acf9914cce2dc81c3624d7c84008ef436dc31bcbb89e8f4416d3615a34", size = 2062170, upload-time = "2025-11-12T15:26:20.837Z" }, + { url = "https://files.pythonhosted.org/packages/07/0e/be41f412e1225bdbd9b7fd7f41a20f070c707f5274b82542eeccf6dc2b79/torchaudio-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:cfd12934c7b54b41d4c79dfd26fbfe88fafa9cc5cc77c074e953bb7018d9322c", size = 669265, upload-time = "2025-11-12T15:26:14.976Z" }, +] + [[package]] name = "torchdata" version = "0.11.0" @@ -6291,7 +6339,7 @@ wheels = [ [[package]] name = "torchvision" -version = "0.24.1" +version = "0.24.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" },