From 2d486193985ff6da14638a6debd6d77fbf43f226 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Tue, 11 Nov 2025 19:35:23 -0800 Subject: [PATCH 01/34] init Signed-off-by: Alexandros Koumparoulis --- dfm/examples/Automodel/pretrain/pretrain.py | 29 +++++++++ .../Automodel/pretrain/wan2_1_t2v_flow.yaml | 64 +++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 dfm/examples/Automodel/pretrain/pretrain.py create mode 100644 dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py new file mode 100644 index 00000000..0d8b2a79 --- /dev/null +++ b/dfm/examples/Automodel/pretrain/pretrain.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from Automodel.recipes.finetune import TrainWan21DiffusionRecipe +from nemo_automodel.components.config._arg_parser import parse_args_and_load_config + + +def main(default_config_path="/opt/DFM/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml"): + cfg = parse_args_and_load_config(default_config_path) + recipe = TrainWan21DiffusionRecipe(cfg) + recipe.setup() + recipe.run_train_validation_loop() + + +if __name__ == "__main__": + main() diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml new file mode 100644 index 00000000..43f4075b --- /dev/null +++ b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml @@ -0,0 +1,64 @@ +seed: 42 + +wandb: + project: wan-t2v-flow-matching-pretrain + mode: online + name: wan2_1_t2v_fm_pretrain + +dist_env: + backend: nccl + timeout_minutes: 30 + +model: + pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers + +data: + dataloader: + _target_: Automodel.datasets.build_wan21_dataloader + meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ + batch_size: 1 + num_workers: 2 + device: cpu + +batch: + batch_size_per_node: 8 + +training: + num_epochs: 100 + +optim: + learning_rate: 5e-5 + optimizer: + weight_decay: 0.1 + betas: [0.9, 0.95] + # "warmup_steps": 1000, + # "lr_min": 1e-5, + + +flow_matching: + use_sigma_noise: true + timestep_sampling: uniform + logit_mean: 0.0 + logit_std: 1.5 + flow_shift: 2.5 + mix_uniform_ratio: 0.2 + # "sigma_min": 0.0, # PRETRAIN: No clamping, full range + # "sigma_max": 1.0, # PRETRAIN: No clamping, full range + +fsdp: + tp_size: 1 + cp_size: 1 + pp_size: 1 + dp_replicate_size: 1 + dp_size: 8 + +logging: + save_every: 1000 + log_every: 2 + +checkpoint: + enabled: true + checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/ + model_save_format: torch_save + save_consolidated: false + restore_from: null \ No newline at end of file From eb61496fdc171ea38c959b88010f2f6771a5c6f0 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 10:19:13 -0800 Subject: [PATCH 02/34] add sigma_min/amx Signed-off-by: Alexandros Koumparoulis --- .../flow_matching/training_step_t2v.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py index 0e7b9bc0..1d75c9e8 100644 --- a/dfm/src/automodel/flow_matching/training_step_t2v.py +++ b/dfm/src/automodel/flow_matching/training_step_t2v.py @@ -40,6 +40,8 @@ def step_fsdp_transformer_t2v( logit_std: float = 1.0, flow_shift: float = 3.0, mix_uniform_ratio: float = 0.1, + sigma_min: float = 0.0, # Default: no clamping (pretrain) + sigma_max: float = 1.0, # Default: no clamping (pretrain) global_step: int = 0, ) -> Tuple[torch.Tensor, Dict]: """ @@ -96,14 +98,26 @@ def step_fsdp_transformer_t2v( # Apply flow shift: σ = shift/(shift + (1/u - 1)) u_clamped = torch.clamp(u, min=1e-5) # Avoid division by zero sigma = flow_shift / (flow_shift + (1.0 / u_clamped - 1.0)) - sigma = torch.clamp(sigma, 0.0, 1.0) + + # Clamp sigma (only if not full range [0,1]) + # Pretrain uses [0, 1], finetune uses [0.02, 0.55] + if sigma_min > 0.0 or sigma_max < 1.0: + sigma = torch.clamp(sigma, sigma_min, sigma_max) + else: + sigma = torch.clamp(sigma, 0.0, 1.0) else: # Simple uniform without shift u = torch.rand(size=(batch_size,), device=device) - sigma = u + + # Clamp sigma (only if not full range [0,1]) + if sigma_min > 0.0 or sigma_max < 1.0: + sigma = torch.clamp(u, sigma_min, sigma_max) + else: + sigma = u sampling_method = "uniform_no_shift" + # ======================================================================== # Manual Flow Matching Noise Addition # ======================================================================== From 44a1cb6f1c1a1f785a4c7566aa48bac2854414a9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 10:20:50 -0800 Subject: [PATCH 03/34] add sigma_min/max Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/finetune.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py index 83c60d75..9684e7ae 100644 --- a/dfm/src/automodel/recipes/finetune.py +++ b/dfm/src/automodel/recipes/finetune.py @@ -198,6 +198,8 @@ def setup(self): self.logit_std = fm_cfg.get("logit_std", 1.0) self.flow_shift = fm_cfg.get("flow_shift", 3.0) self.mix_uniform_ratio = fm_cfg.get("mix_uniform_ratio", 0.1) + self.sigma_min = fm_cfg.get("sigma_min", 0.0) + self.sigma_max = fm_cfg.get("sigma_max", 1.0) logging.info(f"[INFO] Flow matching: {'ENABLED' if self.use_sigma_noise else 'DISABLED'}") if self.use_sigma_noise: @@ -367,6 +369,8 @@ def run_train_validation_loop(self): logit_std=self.logit_std, flow_shift=self.flow_shift, mix_uniform_ratio=self.mix_uniform_ratio, + sigma_min=self.sigma_min, + sigma_max=self.sigma_max, global_step=global_step, ) except Exception as exc: From 275ac71a64beef29a3d536f99dfbc0cf44180f9a Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 10:40:54 -0800 Subject: [PATCH 04/34] rename fientune.py to train.py Signed-off-by: Alexandros Koumparoulis --- dfm/examples/Automodel/pretrain/pretrain.py | 2 +- dfm/examples/automodel/finetune/finetune.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py index 0d8b2a79..f1038198 100644 --- a/dfm/examples/Automodel/pretrain/pretrain.py +++ b/dfm/examples/Automodel/pretrain/pretrain.py @@ -14,7 +14,7 @@ from __future__ import annotations -from Automodel.recipes.finetune import TrainWan21DiffusionRecipe +from Automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config diff --git a/dfm/examples/automodel/finetune/finetune.py b/dfm/examples/automodel/finetune/finetune.py index ae07451f..7d77162c 100644 --- a/dfm/examples/automodel/finetune/finetune.py +++ b/dfm/examples/automodel/finetune/finetune.py @@ -14,7 +14,7 @@ from __future__ import annotations -from Automodel.recipes.finetune import TrainWan21DiffusionRecipe +from Automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config From 8490de5306e57a1465893198e1cea61626bd61a9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 11:08:51 -0800 Subject: [PATCH 05/34] add from_config Signed-off-by: Alexandros Koumparoulis --- .../_diffusers/auto_diffusion_pipeline.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index d9e8c3ce..22a1edd3 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -154,3 +154,60 @@ def from_pretrained( parallel_module = manager.parallelize(comp_module) setattr(pipe, comp_name, parallel_module) return pipe, created_managers + + @classmethod + def from_config( + cls, + pretrained_model_name_or_path: str, + *model_args, + parallel_scheme: Optional[Dict[str, Dict[str, Any]]] = None, + device: Optional[torch.device] = None, + torch_dtype: Any = "auto", + move_to_device: bool = True, + load_for_training: bool = False, + components_to_load: Optional[Iterable[str]] = None, + **kwargs, + ) -> tuple[DiffusionPipeline, Dict[str, FSDP2Manager]]: + config = WanTransformer3DModel.from_pretrained( + pretrained_model_name_or_path, + subfolder="transformer", + torch_dtype=torch_dtype, + **kwargs, + ) + pipe: DiffusionPipeline = DiffusionPipeline.from_config( + config, + *model_args, + torch_dtype=torch_dtype, + **kwargs, + ) + # Decide device + dev = _choose_device(device) + + # Move modules to device/dtype first (helps avoid initial OOM during sharding) + if move_to_device: + for name, module in _iter_pipeline_modules(pipe): + if not components_to_load or name in components_to_load: + logger.info("[INFO] Moving module: %s to device/dtype", name) + _move_module_to_device(module, dev, torch_dtype) + + # If loading for training, ensure the target module parameters are trainable + if load_for_training: + for name, module in _iter_pipeline_modules(pipe): + if not components_to_load or name in components_to_load: + logger.info("[INFO] Ensuring params trainable: %s", name) + _ensure_params_trainable(module, module_name=name) + + # Use per-component FSDP2Manager init-args to parallelize components + created_managers: Dict[str, FSDP2Manager] = {} + if parallel_scheme is not None: + assert torch.distributed.is_initialized(), "Expect distributed environment to be initialized" + _init_parallelizer() + for comp_name, comp_module in _iter_pipeline_modules(pipe): + manager_args = parallel_scheme.get(comp_name) + if manager_args is None: + continue + manager = FSDP2Manager(**manager_args) + created_managers[comp_name] = manager + parallel_module = manager.parallelize(comp_module) + setattr(pipe, comp_name, parallel_module) + return pipe, created_managers From 1d1ca1dddfd6e4149bb03892ea25190c5bf83d4e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 19:56:51 -0800 Subject: [PATCH 06/34] pass scheduler and model Signed-off-by: Alexandros Koumparoulis --- .../automodel/flow_matching/training_step_t2v.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py index 1d75c9e8..dc80ce74 100644 --- a/dfm/src/automodel/flow_matching/training_step_t2v.py +++ b/dfm/src/automodel/flow_matching/training_step_t2v.py @@ -28,8 +28,8 @@ def step_fsdp_transformer_t2v( - pipe, - model_map: Dict, + scheduler, + model, batch, device, bf16, @@ -76,7 +76,7 @@ def step_fsdp_transformer_t2v( # Flow Matching Timestep Sampling # ======================================================================== - num_train_timesteps = pipe.scheduler.config.num_train_timesteps + num_train_timesteps = scheduler.config.num_train_timesteps if use_sigma_noise: use_uniform = torch.rand(1).item() < mix_uniform_ratio @@ -117,7 +117,6 @@ def step_fsdp_transformer_t2v( sigma = u sampling_method = "uniform_no_shift" - # ======================================================================== # Manual Flow Matching Noise Addition # ======================================================================== @@ -200,10 +199,8 @@ def step_fsdp_transformer_t2v( # Forward Pass # ======================================================================== - fsdp_model = model_map["transformer"]["fsdp_transformer"] - try: - model_pred = fsdp_model( + model_pred = model( hidden_states=noisy_latents, timestep=timesteps_for_model, encoder_hidden_states=text_embeddings, @@ -257,7 +254,7 @@ def step_fsdp_transformer_t2v( logger.info(f"[STEP {global_step}] LOSS DEBUG") logger.info("=" * 80) logger.info("[TARGET] Flow matching: v = ε - x_0") - logger.info(f"[PREDICTION] Scheduler type (inference only): {type(pipe.scheduler).__name__}") + logger.info(f"[PREDICTION] Scheduler type (inference only): {type(scheduler).__name__}") logger.info("") logger.info(f"[RANGES] Model pred: [{model_pred.min():.4f}, {model_pred.max():.4f}]") logger.info(f"[RANGES] Target (v): [{target.min():.4f}, {target.max():.4f}]") From 16ab73eede8e018cddc1add66ed0df41db4923ff Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 19:57:32 -0800 Subject: [PATCH 07/34] update param Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/finetune.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py index 9684e7ae..88603e58 100644 --- a/dfm/src/automodel/recipes/finetune.py +++ b/dfm/src/automodel/recipes/finetune.py @@ -22,7 +22,7 @@ import torch import torch.distributed as dist import wandb -from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline +from Automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline from Automodel.flow_matching.training_step_t2v import ( step_fsdp_transformer_t2v, ) @@ -51,10 +51,10 @@ def build_model_and_optimizer( dp_replicate_size: Optional[int] = None, use_hf_tp_plan: bool = False, optimizer_cfg: Optional[Dict[str, Any]] = None, -) -> tuple[NeMoAutoDiffusionPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]: +) -> tuple[NeMoWanPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]: """Build the WAN 2.1 diffusion model, parallel scheme, and optimizer.""" - logging.info("[INFO] Building NeMoAutoDiffusionPipeline with transformer parallel scheme...") + logging.info("[INFO] Building NeMoWanPipeline with transformer parallel scheme...") if not dist.is_initialized(): logging.info("[WARN] torch.distributed not initialized; proceeding in single-process mode") @@ -84,7 +84,7 @@ def build_model_and_optimizer( parallel_scheme = {"transformer": manager_args} - pipe, created_managers = NeMoAutoDiffusionPipeline.from_pretrained( + pipe, created_managers = NeMoWanPipeline.from_pretrained( model_id, torch_dtype=bf16_dtype, device=device, @@ -93,11 +93,7 @@ def build_model_and_optimizer( components_to_load=["transformer"], ) fsdp2_manager = created_managers["transformer"] - transformer_module = getattr(pipe, "transformer", None) - if transformer_module is None: - raise RuntimeError("transformer not found in pipeline after parallelization") - - model_map: dict[str, Dict[str, Any]] = {"transformer": {"fsdp_transformer": transformer_module}} + transformer_module = pipe.transformer trainable_params = [p for p in transformer_module.parameters() if p.requires_grad] if not trainable_params: @@ -121,7 +117,7 @@ def build_model_and_optimizer( logging.info("[INFO] NeMoAutoDiffusion setup complete (pipeline + optimizer)") - return pipe, model_map, optimizer, fsdp2_manager.device_mesh + return pipe, optimizer, fsdp2_manager.device_mesh def build_lr_scheduler( @@ -214,7 +210,7 @@ def setup(self): dp_replicate_size = fsdp_cfg.get("dp_replicate_size", None) use_hf_tp_plan = fsdp_cfg.get("use_hf_tp_plan", False) - (self.pipe, self.model_map, self.optimizer, self.device_mesh) = build_model_and_optimizer( + (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer( model_id=self.model_id, learning_rate=self.learning_rate, device=self.device, @@ -229,7 +225,7 @@ def setup(self): optimizer_cfg=self.cfg.get("optim.optimizer", {}), ) - self.model = self.model_map["transformer"]["fsdp_transformer"] + self.model = self.pipe.transformer self.peft_config = None batch_cfg = self.cfg.get("batch", {}) @@ -358,8 +354,8 @@ def run_train_validation_loop(self): for micro_batch in batch_group: try: loss, _ = step_fsdp_transformer_t2v( - pipe=self.pipe, - model_map=self.model_map, + scheduler=self.pipe.scheduler, + model=self.model, batch=micro_batch, device=self.device, bf16=self.bf16, From 655c6e177a1aad33df24159daa1fe12df7c23d68 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 19:58:46 -0800 Subject: [PATCH 08/34] introduce NeMoWanPipeline Signed-off-by: Alexandros Koumparoulis --- .../_diffusers/auto_diffusion_pipeline.py | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index 22a1edd3..b7eb979a 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os from typing import Any, Dict, Iterable, Optional, Tuple @@ -19,7 +20,7 @@ import torch import torch.nn as nn from Automodel.distributed.dfm_parallelizer import WanParallelizationStrategy -from diffusers import DiffusionPipeline +from diffusers import DiffusionPipeline, WanPipeline from nemo_automodel.components.distributed import parallelizer from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager from nemo_automodel.shared.utils import dtype_from_str @@ -155,30 +156,48 @@ def from_pretrained( setattr(pipe, comp_name, parallel_module) return pipe, created_managers + +class NeMoWanPipeline(WanPipeline): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + return NeMoAutoDiffusionPipeline.from_pretrained(*args, **kwargs) + @classmethod def from_config( cls, - pretrained_model_name_or_path: str, - *model_args, + model_id, + torch_dtype: torch.dtype = torch.bfloat16, + config: dict = None, parallel_scheme: Optional[Dict[str, Dict[str, Any]]] = None, device: Optional[torch.device] = None, - torch_dtype: Any = "auto", move_to_device: bool = True, - load_for_training: bool = False, components_to_load: Optional[Iterable[str]] = None, - **kwargs, - ) -> tuple[DiffusionPipeline, Dict[str, FSDP2Manager]]: - config = WanTransformer3DModel.from_pretrained( - pretrained_model_name_or_path, - subfolder="transformer", + ): + # Load just the config + from diffusers import WanTransformer3DModel + + if model_id is not None: + transformer = WanTransformer3DModel.from_pretrained( + model_id, + subfolder="transformer", + torch_dtype=torch.bfloat16, + ) + + # Get config and reinitialize with random weights + config = copy.deepcopy(transformer.config) + del transformer + + # Initialize with random weights + transformer = WanTransformer3DModel.from_config(config) + + # Load pipeline with random transformer + pipe = WanPipeline.from_pretrained( + model_id, + transformer=transformer, torch_dtype=torch_dtype, - **kwargs, - ) - pipe: DiffusionPipeline = DiffusionPipeline.from_config( - config, - *model_args, - torch_dtype=torch_dtype, - **kwargs, ) # Decide device dev = _choose_device(device) @@ -190,13 +209,6 @@ def from_config( logger.info("[INFO] Moving module: %s to device/dtype", name) _move_module_to_device(module, dev, torch_dtype) - # If loading for training, ensure the target module parameters are trainable - if load_for_training: - for name, module in _iter_pipeline_modules(pipe): - if not components_to_load or name in components_to_load: - logger.info("[INFO] Ensuring params trainable: %s", name) - _ensure_params_trainable(module, module_name=name) - # Use per-component FSDP2Manager init-args to parallelize components created_managers: Dict[str, FSDP2Manager] = {} if parallel_scheme is not None: From 56bd770953e39701f548624af6deac2ff3c456b5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 20:04:27 -0800 Subject: [PATCH 09/34] add mode Signed-off-by: Alexandros Koumparoulis --- dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml index 43f4075b..0c44244f 100644 --- a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml +++ b/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml @@ -11,6 +11,7 @@ dist_env: model: pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers + mode: pretrain data: dataloader: From 72f4187257eb110ba6eb160f01b50a03358a89fa Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 20:05:16 -0800 Subject: [PATCH 10/34] update build_model_and_optimizer Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/finetune.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py index 88603e58..7777b3d2 100644 --- a/dfm/src/automodel/recipes/finetune.py +++ b/dfm/src/automodel/recipes/finetune.py @@ -40,6 +40,7 @@ def build_model_and_optimizer( *, model_id: str, + finetune_mode: bool, learning_rate: float, device: torch.device, bf16_dtype: torch.dtype, @@ -84,7 +85,9 @@ def build_model_and_optimizer( parallel_scheme = {"transformer": manager_args} - pipe, created_managers = NeMoWanPipeline.from_pretrained( + init_fn = NeMoWanPipeline.from_pretrained if finetune_mode else NeMoWanPipeline.from_config + + pipe, created_managers = init_fn( model_id, torch_dtype=bf16_dtype, device=device, @@ -212,6 +215,7 @@ def setup(self): (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer( model_id=self.model_id, + finetune_mode=self.cfg.get("model.mode", "finetune").lower() == "finetune", learning_rate=self.learning_rate, device=self.device, bf16_dtype=self.bf16, From f06e7992092160a1f3c92e5a79f4a60e7f547c6e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 22:59:11 -0800 Subject: [PATCH 11/34] update Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/finetune.py | 57 +++++++++++---------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/finetune.py index 7777b3d2..5490e595 100644 --- a/dfm/src/automodel/recipes/finetune.py +++ b/dfm/src/automodel/recipes/finetune.py @@ -43,14 +43,9 @@ def build_model_and_optimizer( finetune_mode: bool, learning_rate: float, device: torch.device, - bf16_dtype: torch.dtype, + dtype: torch.dtype, cpu_offload: bool = False, - tp_size: int = 1, - cp_size: int = 1, - pp_size: int = 1, - dp_size: Optional[int] = None, - dp_replicate_size: Optional[int] = None, - use_hf_tp_plan: bool = False, + fsdp_cfg: Dict[str, Any] = {}, optimizer_cfg: Optional[Dict[str, Any]] = None, ) -> tuple[NeMoWanPipeline, dict[str, Dict[str, Any]], torch.optim.Optimizer, Any]: """Build the WAN 2.1 diffusion model, parallel scheme, and optimizer.""" @@ -62,38 +57,42 @@ def build_model_and_optimizer( world_size = dist.get_world_size() if dist.is_initialized() else 1 - if dp_size is None: - denom = max(1, tp_size * cp_size * pp_size) - dp_size = max(1, world_size // denom) + if fsdp_cfg.get("dp_size", None) is None: + denom = max(1, fsdp_cfg.get("tp_size", 1) * fsdp_cfg.get("cp_size", 1) * fsdp_cfg.get("pp_size", 1)) + fsdp_cfg.dp_size = max(1, world_size // denom) manager_args: Dict[str, Any] = { - "dp_size": dp_size, - "dp_replicate_size": dp_replicate_size, - "tp_size": tp_size, - "cp_size": cp_size, - "pp_size": pp_size, + "dp_size": fsdp_cfg.get("dp_size", None), + "dp_replicate_size": fsdp_cfg.get("dp_replicate_size", None), + "tp_size": fsdp_cfg.get("tp_size", 1), + "cp_size": fsdp_cfg.get("cp_size", 1), + "pp_size": fsdp_cfg.get("pp_size", 1), "backend": "nccl", "world_size": world_size, - "use_hf_tp_plan": use_hf_tp_plan, + "use_hf_tp_plan": fsdp_cfg.get("use_hf_tp_plan", False), "activation_checkpointing": True, "mp_policy": MixedPrecisionPolicy( - param_dtype=bf16_dtype, - reduce_dtype=bf16_dtype, - output_dtype=bf16_dtype, + param_dtype=dtype, + reduce_dtype=dtype, + output_dtype=dtype, ), } parallel_scheme = {"transformer": manager_args} + kwargs = {} + if finetune_mode: + kwargs["load_for_training"] = True + kwargs["low_cpu_mem_usage"] = True init_fn = NeMoWanPipeline.from_pretrained if finetune_mode else NeMoWanPipeline.from_config pipe, created_managers = init_fn( model_id, - torch_dtype=bf16_dtype, + torch_dtype=dtype, device=device, parallel_scheme=parallel_scheme, - load_for_training=True, components_to_load=["transformer"], + **kwargs, ) fsdp2_manager = created_managers["transformer"] transformer_module = pipe.transformer @@ -206,26 +205,14 @@ def setup(self): logging.info(f"[INFO] - Flow shift: {self.flow_shift}") logging.info(f"[INFO] - Mix uniform ratio: {self.mix_uniform_ratio}") - tp_size = fsdp_cfg.get("tp_size", 1) - cp_size = fsdp_cfg.get("cp_size", 1) - pp_size = fsdp_cfg.get("pp_size", 1) - dp_size = fsdp_cfg.get("dp_size", None) - dp_replicate_size = fsdp_cfg.get("dp_replicate_size", None) - use_hf_tp_plan = fsdp_cfg.get("use_hf_tp_plan", False) - (self.pipe, self.optimizer, self.device_mesh) = build_model_and_optimizer( model_id=self.model_id, finetune_mode=self.cfg.get("model.mode", "finetune").lower() == "finetune", learning_rate=self.learning_rate, device=self.device, - bf16_dtype=self.bf16, + dtype=self.bf16, cpu_offload=self.cpu_offload, - tp_size=tp_size, - cp_size=cp_size, - pp_size=pp_size, - dp_size=dp_size, - dp_replicate_size=dp_replicate_size, - use_hf_tp_plan=use_hf_tp_plan, + fsdp_cfg=fsdp_cfg, optimizer_cfg=self.cfg.get("optim.optimizer", {}), ) From bd02816de91c8219f62cda799d8d995b89f11321 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 22:59:41 -0800 Subject: [PATCH 12/34] update NeMoWanPipeline Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index b7eb979a..2f6351b7 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -157,7 +157,7 @@ def from_pretrained( return pipe, created_managers -class NeMoWanPipeline(WanPipeline): +class NeMoWanPipeline: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -179,7 +179,7 @@ def from_config( # Load just the config from diffusers import WanTransformer3DModel - if model_id is not None: + if config is None: transformer = WanTransformer3DModel.from_pretrained( model_id, subfolder="transformer", From 09baf0f2931a9bd11925f610e38dc71f2af9ea74 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:03:56 -0800 Subject: [PATCH 13/34] rename Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/{finetune.py => train.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dfm/src/automodel/recipes/{finetune.py => train.py} (100%) diff --git a/dfm/src/automodel/recipes/finetune.py b/dfm/src/automodel/recipes/train.py similarity index 100% rename from dfm/src/automodel/recipes/finetune.py rename to dfm/src/automodel/recipes/train.py From 02ab4fc0caf67ec9feb347114ee6a7b31c1e8756 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:05:14 -0800 Subject: [PATCH 14/34] move examples Signed-off-by: Alexandros Koumparoulis --- dfm/examples/Automodel/pretrain/pretrain.py | 2 +- {dfm/examples => examples}/automodel/finetune/finetune.py | 2 +- .../automodel/finetune/wan2_1_t2v_flow.yaml | 0 .../automodel/finetune/wan2_1_t2v_flow_multinode.yaml | 0 {dfm/examples => examples}/automodel/generate/wan_generate.py | 0 examples/dtensor/README.md | 3 --- examples/dtensor/configs/README.md | 3 --- examples/dtensor/scripts/README.md | 3 --- 8 files changed, 2 insertions(+), 11 deletions(-) rename {dfm/examples => examples}/automodel/finetune/finetune.py (94%) rename {dfm/examples => examples}/automodel/finetune/wan2_1_t2v_flow.yaml (100%) rename {dfm/examples => examples}/automodel/finetune/wan2_1_t2v_flow_multinode.yaml (100%) rename {dfm/examples => examples}/automodel/generate/wan_generate.py (100%) delete mode 100644 examples/dtensor/README.md delete mode 100644 examples/dtensor/configs/README.md delete mode 100644 examples/dtensor/scripts/README.md diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/dfm/examples/Automodel/pretrain/pretrain.py index f1038198..ec054d27 100644 --- a/dfm/examples/Automodel/pretrain/pretrain.py +++ b/dfm/examples/Automodel/pretrain/pretrain.py @@ -14,7 +14,7 @@ from __future__ import annotations -from Automodel.recipes.train import TrainWan21DiffusionRecipe +from automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config diff --git a/dfm/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py similarity index 94% rename from dfm/examples/automodel/finetune/finetune.py rename to examples/automodel/finetune/finetune.py index 7d77162c..1f12c336 100644 --- a/dfm/examples/automodel/finetune/finetune.py +++ b/examples/automodel/finetune/finetune.py @@ -14,7 +14,7 @@ from __future__ import annotations -from Automodel.recipes.train import TrainWan21DiffusionRecipe +from automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml b/examples/automodel/finetune/wan2_1_t2v_flow.yaml similarity index 100% rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow.yaml rename to examples/automodel/finetune/wan2_1_t2v_flow.yaml diff --git a/dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml similarity index 100% rename from dfm/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml rename to examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml diff --git a/dfm/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py similarity index 100% rename from dfm/examples/automodel/generate/wan_generate.py rename to examples/automodel/generate/wan_generate.py diff --git a/examples/dtensor/README.md b/examples/dtensor/README.md deleted file mode 100644 index 709a9755..00000000 --- a/examples/dtensor/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# DTensor Models (NeMo Automodel) - -Examples using NeMo Automodel with distributed tensor parallelism. diff --git a/examples/dtensor/configs/README.md b/examples/dtensor/configs/README.md deleted file mode 100644 index c7df1772..00000000 --- a/examples/dtensor/configs/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Configs - -Configuration files for various Wan model versions. diff --git a/examples/dtensor/scripts/README.md b/examples/dtensor/scripts/README.md deleted file mode 100644 index 0a18e12b..00000000 --- a/examples/dtensor/scripts/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Scripts - -Training scripts for pretraining and finetuning. From 6f64890d8782b3c601909dd78f8bfb59d1fc792d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:08:30 -0800 Subject: [PATCH 15/34] move Signed-off-by: Alexandros Koumparoulis --- .../Automodel => examples/automodel}/pretrain/pretrain.py | 0 .../automodel}/pretrain/wan2_1_t2v_flow.yaml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {dfm/examples/Automodel => examples/automodel}/pretrain/pretrain.py (100%) rename {dfm/examples/Automodel => examples/automodel}/pretrain/wan2_1_t2v_flow.yaml (100%) diff --git a/dfm/examples/Automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py similarity index 100% rename from dfm/examples/Automodel/pretrain/pretrain.py rename to examples/automodel/pretrain/pretrain.py diff --git a/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml similarity index 100% rename from dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml rename to examples/automodel/pretrain/wan2_1_t2v_flow.yaml From ec15d7e53ce1804e37b8d5e6400931a32a9033c5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:09:43 -0800 Subject: [PATCH 16/34] fix Signed-off-by: Alexandros Koumparoulis --- examples/automodel/finetune/finetune.py | 2 +- examples/automodel/pretrain/pretrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py index 1f12c336..d1e840ee 100644 --- a/examples/automodel/finetune/finetune.py +++ b/examples/automodel/finetune/finetune.py @@ -14,7 +14,7 @@ from __future__ import annotations -from automodel.recipes.train import TrainWan21DiffusionRecipe +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config diff --git a/examples/automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py index ec054d27..e77f9ede 100644 --- a/examples/automodel/pretrain/pretrain.py +++ b/examples/automodel/pretrain/pretrain.py @@ -14,7 +14,7 @@ from __future__ import annotations -from automodel.recipes.train import TrainWan21DiffusionRecipe +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config From c2299ce512a77ba05c48e1317a099c47818372b3 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:13:39 -0800 Subject: [PATCH 17/34] fix Signed-off-by: Alexandros Koumparoulis --- examples/automodel/finetune/finetune.py | 3 ++- examples/automodel/pretrain/pretrain.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/automodel/finetune/finetune.py b/examples/automodel/finetune/finetune.py index d1e840ee..5c9da942 100644 --- a/examples/automodel/finetune/finetune.py +++ b/examples/automodel/finetune/finetune.py @@ -14,9 +14,10 @@ from __future__ import annotations -from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe + def main(default_config_path="/opt/DFM/dfm/examples/Automodel/finetune/wan2_1_t2v_flow.yaml"): cfg = parse_args_and_load_config(default_config_path) diff --git a/examples/automodel/pretrain/pretrain.py b/examples/automodel/pretrain/pretrain.py index e77f9ede..f7a38930 100644 --- a/examples/automodel/pretrain/pretrain.py +++ b/examples/automodel/pretrain/pretrain.py @@ -14,9 +14,10 @@ from __future__ import annotations -from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe from nemo_automodel.components.config._arg_parser import parse_args_and_load_config +from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe + def main(default_config_path="/opt/DFM/dfm/examples/Automodel/pretrain/wan2_1_t2v_flow.yaml"): cfg = parse_args_and_load_config(default_config_path) From d66290599a7b71f035eefd6e2eca1cf26bc9553b Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:15:03 -0800 Subject: [PATCH 18/34] fix Signed-off-by: Alexandros Koumparoulis --- examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml index 0c44244f..88eabc76 100644 --- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml +++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml @@ -62,4 +62,4 @@ checkpoint: checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/ model_save_format: torch_save save_consolidated: false - restore_from: null \ No newline at end of file + restore_from: nul \ No newline at end of file From 8b727473cba19fb72f77db383606cdcd0c9b0594 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:16:18 -0800 Subject: [PATCH 19/34] fix Signed-off-by: Alexandros Koumparoulis --- examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml index 88eabc76..caf87ab6 100644 --- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml +++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml @@ -62,4 +62,5 @@ checkpoint: checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/ model_save_format: torch_save save_consolidated: false - restore_from: nul \ No newline at end of file + restore_from: null + From 99aa19ad9551786146309009290e37fe97c07b2e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Wed, 12 Nov 2025 23:17:56 -0800 Subject: [PATCH 20/34] fix Signed-off-by: Alexandros Koumparoulis --- examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml index caf87ab6..113d500f 100644 --- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml +++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml @@ -63,4 +63,3 @@ checkpoint: model_save_format: torch_save save_consolidated: false restore_from: null - From 501c3d1ac263e3815fa15da30ae0ed1f63cb6330 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Thu, 13 Nov 2025 12:41:26 -0800 Subject: [PATCH 21/34] fix imports Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py | 2 +- dfm/src/automodel/datasets/__init__.py | 2 +- dfm/src/automodel/flow_matching/training_step_t2v.py | 2 +- dfm/src/automodel/recipes/train.py | 4 ++-- examples/automodel/generate/wan_generate.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index 2f6351b7..2d6f1756 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -19,7 +19,7 @@ import torch import torch.nn as nn -from Automodel.distributed.dfm_parallelizer import WanParallelizationStrategy +from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy from diffusers import DiffusionPipeline, WanPipeline from nemo_automodel.components.distributed import parallelizer from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager diff --git a/dfm/src/automodel/datasets/__init__.py b/dfm/src/automodel/datasets/__init__.py index a3ef8358..051d4cd2 100644 --- a/dfm/src/automodel/datasets/__init__.py +++ b/dfm/src/automodel/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from Automodel.datasets.wan21 import ( +from dfm.src.automodel.datasets.wan21 import ( MetaFilesDataset, build_node_parallel_sampler, build_wan21_dataloader, diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py index dc80ce74..8e4cce5c 100644 --- a/dfm/src/automodel/flow_matching/training_step_t2v.py +++ b/dfm/src/automodel/flow_matching/training_step_t2v.py @@ -19,7 +19,7 @@ from typing import Dict, Tuple import torch -from Automodel.flow_matching.time_shift_utils import ( +from dfm.src.automodel.flow_matching.time_shift_utils import ( compute_density_for_timestep_sampling, ) diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py index 5490e595..8f560fa3 100644 --- a/dfm/src/automodel/recipes/train.py +++ b/dfm/src/automodel/recipes/train.py @@ -22,8 +22,8 @@ import torch import torch.distributed as dist import wandb -from Automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline -from Automodel.flow_matching.training_step_t2v import ( +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline +from dfm.src.automodel.flow_matching.training_step_t2v import ( step_fsdp_transformer_t2v, ) from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig diff --git a/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py index 2868ef9b..ae5c928a 100644 --- a/examples/automodel/generate/wan_generate.py +++ b/examples/automodel/generate/wan_generate.py @@ -18,7 +18,7 @@ import torch import torch.distributed as dist -from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline from diffusers import AutoencoderKLWan from diffusers.utils import export_to_video from nemo_automodel.components.distributed.init_utils import initialize_distributed From 298ee2d65f655d806f6c76654df120e6c2933c15 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Thu, 13 Nov 2025 12:46:07 -0800 Subject: [PATCH 22/34] lint Signed-off-by: Alexandros Koumparoulis --- .../automodel/_diffusers/auto_diffusion_pipeline.py | 3 ++- dfm/src/automodel/flow_matching/training_step_t2v.py | 1 + dfm/src/automodel/recipes/train.py | 11 ++++++----- examples/automodel/generate/wan_generate.py | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py index 2d6f1756..cb9e9d00 100644 --- a/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py +++ b/dfm/src/automodel/_diffusers/auto_diffusion_pipeline.py @@ -19,12 +19,13 @@ import torch import torch.nn as nn -from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy from diffusers import DiffusionPipeline, WanPipeline from nemo_automodel.components.distributed import parallelizer from nemo_automodel.components.distributed.fsdp2 import FSDP2Manager from nemo_automodel.shared.utils import dtype_from_str +from dfm.src.automodel.distributed.dfm_parallelizer import WanParallelizationStrategy + logger = logging.getLogger(__name__) diff --git a/dfm/src/automodel/flow_matching/training_step_t2v.py b/dfm/src/automodel/flow_matching/training_step_t2v.py index 8e4cce5c..18cce361 100644 --- a/dfm/src/automodel/flow_matching/training_step_t2v.py +++ b/dfm/src/automodel/flow_matching/training_step_t2v.py @@ -19,6 +19,7 @@ from typing import Dict, Tuple import torch + from dfm.src.automodel.flow_matching.time_shift_utils import ( compute_density_for_timestep_sampling, ) diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py index 8f560fa3..474cfe98 100644 --- a/dfm/src/automodel/recipes/train.py +++ b/dfm/src/automodel/recipes/train.py @@ -21,11 +21,6 @@ import torch import torch.distributed as dist -import wandb -from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline -from dfm.src.automodel.flow_matching.training_step_t2v import ( - step_fsdp_transformer_t2v, -) from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig from nemo_automodel.components.loggers.log_utils import setup_logging from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages @@ -36,6 +31,12 @@ from torch.distributed.fsdp import MixedPrecisionPolicy from transformers.utils.hub import TRANSFORMERS_CACHE +import wandb +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline +from dfm.src.automodel.flow_matching.training_step_t2v import ( + step_fsdp_transformer_t2v, +) + def build_model_and_optimizer( *, diff --git a/examples/automodel/generate/wan_generate.py b/examples/automodel/generate/wan_generate.py index ae5c928a..829ff308 100644 --- a/examples/automodel/generate/wan_generate.py +++ b/examples/automodel/generate/wan_generate.py @@ -18,12 +18,13 @@ import torch import torch.distributed as dist -from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline from diffusers import AutoencoderKLWan from diffusers.utils import export_to_video from nemo_automodel.components.distributed.init_utils import initialize_distributed from nemo_automodel.components.loggers.log_utils import setup_logging +from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline + def parse_args(): parser = argparse.ArgumentParser(description="Wan2.2 T2V FSDP2 generation") From 819d7d60aed06aef49756255d1cda8eaa65a9a0e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Thu, 13 Nov 2025 12:48:12 -0800 Subject: [PATCH 23/34] more lint Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py index 474cfe98..e24aa1c9 100644 --- a/dfm/src/automodel/recipes/train.py +++ b/dfm/src/automodel/recipes/train.py @@ -31,7 +31,6 @@ from torch.distributed.fsdp import MixedPrecisionPolicy from transformers.utils.hub import TRANSFORMERS_CACHE -import wandb from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline from dfm.src.automodel.flow_matching.training_step_t2v import ( step_fsdp_transformer_t2v, From 6554e4748236169cb59a46fed004001d12d65ce5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Thu, 13 Nov 2025 15:32:42 -0800 Subject: [PATCH 24/34] fix import Signed-off-by: Alexandros Koumparoulis --- examples/automodel/finetune/wan2_1_t2v_flow.yaml | 2 +- examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml | 2 +- examples/automodel/pretrain/wan2_1_t2v_flow.yaml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/automodel/finetune/wan2_1_t2v_flow.yaml b/examples/automodel/finetune/wan2_1_t2v_flow.yaml index 6b4e3528..cced17b9 100644 --- a/examples/automodel/finetune/wan2_1_t2v_flow.yaml +++ b/examples/automodel/finetune/wan2_1_t2v_flow.yaml @@ -14,7 +14,7 @@ model: data: dataloader: - _target_: Automodel.datasets.build_wan21_dataloader + _target_: dfm.src.automodel.datasets.build_wan21_dataloader meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ batch_size: 1 num_workers: 2 diff --git a/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml index 16d4793a..20539da5 100644 --- a/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml +++ b/examples/automodel/finetune/wan2_1_t2v_flow_multinode.yaml @@ -14,7 +14,7 @@ model: data: dataloader: - _target_: Automodel.datasets.build_wan21_dataloader + _target_: dfm.src.automodel.datasets.build_wan21_dataloader meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ batch_size: 1 num_workers: 2 diff --git a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml index 113d500f..eeabb29a 100644 --- a/examples/automodel/pretrain/wan2_1_t2v_flow.yaml +++ b/examples/automodel/pretrain/wan2_1_t2v_flow.yaml @@ -15,7 +15,7 @@ model: data: dataloader: - _target_: Automodel.datasets.build_wan21_dataloader + _target_: dfm.src.automodel.datasets.build_wan21_dataloader meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/ batch_size: 1 num_workers: 2 @@ -51,7 +51,7 @@ fsdp: cp_size: 1 pp_size: 1 dp_replicate_size: 1 - dp_size: 8 + dp_size: none logging: save_every: 1000 From f6600da867ee466a96ea6efd1024399534cc3a7e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Thu, 13 Nov 2025 15:46:06 -0800 Subject: [PATCH 25/34] fix 3rdparty & pyproject Signed-off-by: Alexandros Koumparoulis --- .gitmodules | 3 --- 3rdparty/Automodel | 1 - 3rdparty/Megatron-Bridge | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) delete mode 160000 3rdparty/Automodel diff --git a/.gitmodules b/.gitmodules index 8ad240e1..274454c7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ [submodule "3rdparty/Automodel"] path = 3rdparty/Automodel url = https://github.com/NVIDIA-NeMo/Automodel.git -[submodule "3rdparty/Megatron-Bridge"] - path = 3rdparty/Megatron-Bridge - url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git diff --git a/3rdparty/Automodel b/3rdparty/Automodel deleted file mode 160000 index a5f06522..00000000 --- a/3rdparty/Automodel +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a5f06522d4f8ef67bb9bbdd9502e50ae27d2fee5 diff --git a/3rdparty/Megatron-Bridge b/3rdparty/Megatron-Bridge index 4e4ce420..8e21f81a 160000 --- a/3rdparty/Megatron-Bridge +++ b/3rdparty/Megatron-Bridge @@ -1 +1 @@ -Subproject commit 4e4ce4203589466d0a5b846e12dd24fa74c57f2a +Subproject commit 8e21f81ab961bdb0ad99a275074fe50aae15d2f9 From 4935ec638e9e249dad4b531a3fce30f095c644fc Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 14 Nov 2025 09:46:59 -0800 Subject: [PATCH 26/34] add torch Signed-off-by: Alexandros Koumparoulis --- pyproject.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 05a40a68..5e403755 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,11 @@ automodel = [ "nemo-automodel", ] megatron-bridge = ["megatron-bridge"] +torch-cu124 = [ + "torch", + "torchvision", + "torchaudio", +] [tool.setuptools] packages = ["dfm"] @@ -124,6 +129,11 @@ override-dependencies = [ "transformer-engine[pytorch]>=2.9.0a0,<2.10.0", ] +[[tool.uv.index]] +name = "pytorch-cu124" +url = "https://download.pytorch.org/whl/cu124" +explicit = true + [[tool.uv.index]] name = "pypi" url = "https://pypi.org/simple" From c286199f39a1d061161bb15d073f8434b9672154 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 14 Nov 2025 09:47:08 -0800 Subject: [PATCH 27/34] update uv.lock Signed-off-by: Alexandros Koumparoulis --- uv.lock | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index 02355563..b6443987 100644 --- a/uv.lock +++ b/uv.lock @@ -3494,6 +3494,11 @@ test = [ { name = "pytest-mock" }, { name = "pytest-runner" }, ] +torch-cu124 = [ + { name = "torch", marker = "sys_platform == 'never'" }, + { name = "torchaudio" }, + { name = "torchvision", marker = "sys_platform == 'never'" }, +] [package.metadata] requires-dist = [ @@ -3542,6 +3547,11 @@ test = [ { name = "pytest-mock", specifier = ">=3.14.0" }, { name = "pytest-runner", specifier = ">=6.0.1" }, ] +torch-cu124 = [ + { name = "torch" }, + { name = "torchaudio" }, + { name = "torchvision" }, +] [[package]] name = "networkx" @@ -6239,7 +6249,7 @@ wheels = [ [[package]] name = "torch" -version = "2.9.1" +version = "2.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, @@ -6262,6 +6272,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/a7/b888635fbb6ae951cffd41e1318966cbed96ec762b4999815ab68269e23f/torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b", size = 1067611, upload-time = "2025-10-24T01:03:01.357Z" }, ] +[[package]] +name = "torchaudio" +version = "2.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "torch", marker = "sys_platform == 'never'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/1c/87/7de58c8f4c1946ec4d9070354eae73d1e4f3d2426e5cfa45febbd8451ce5/torchaudio-2.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd13541197e035338bd43225b2067532056486d357c661e12d49ace4fc37f8bb", size = 805912, upload-time = "2025-11-12T15:25:47.857Z" }, + { url = "https://files.pythonhosted.org/packages/6d/1b/680ca01211a39746aedf54e475783f846fbd7961dfeb17bce7d123f931f0/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:31ec46b718b7caa0182221bfb42e2ad223947b752a996dcdc0388c34a678c966", size = 472829, upload-time = "2025-11-12T15:25:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/c1/ee/d71e6d78d203d72f99c426fbbf2bcd801cf084d8f1891bb1f42c95bc5ec5/torchaudio-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ee11695b367f64638b4a0340cc9abb9be2173c6537bfe4ab286c6fbff68a1444", size = 2055454, upload-time = "2025-11-12T15:25:50.519Z" }, + { url = "https://files.pythonhosted.org/packages/19/43/dcfadd58a21704835da8bcc43bbb999887a7a1f8965aab527bd50459272c/torchaudio-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:acffac66d0908baa4ef16ce5ce6d2a7bc10c2534fce719b146744f306ba08c4a", size = 663868, upload-time = "2025-11-12T15:25:51.755Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/34e489fcb4adc4b571a166f2670cc7f156cbe3337867a892fade0a1a5224/torchaudio-2.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6e3f5943135701168d30196e2befd46290180cdbb9ee508b167730d51f43208f", size = 807349, upload-time = "2025-11-12T15:25:57.843Z" }, + { url = "https://files.pythonhosted.org/packages/a6/52/66830da8b638368bc0aef064f3307c88d28b526ff8e60a1fda681466b1b3/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d192cf3b1b677f6666dad60caf0ce7bab66965751570c694645dd905a6c61724", size = 474291, upload-time = "2025-11-12T15:25:45.21Z" }, + { url = "https://files.pythonhosted.org/packages/cb/6f/d8f1f36c9f63ddef78f00f8f8ddb9638128ceb5f6824c28bead5af48fc63/torchaudio-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8327e21f51dced2b6de3ac6a63f04bae9be9bc213e151f85c76164568c7ebc3d", size = 2058677, upload-time = "2025-11-12T15:25:53.09Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ef/0ec42e783774bd1dda8bc2489e18b3e9c0a250384e0131cec9f35949f385/torchaudio-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:b41339a71b186bad238d94cfb68d4c202db0033088a7b824ce5484674bf67057", size = 664681, upload-time = "2025-11-12T15:25:59.08Z" }, + { url = "https://files.pythonhosted.org/packages/f1/83/71cbadd7b66753818b5775f2088bad4f721d581de276996df4968000a626/torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7581ef170794c599aed55918e00d0acd9e5c9a0f19400c9a9a840955180365c5", size = 808098, upload-time = "2025-11-12T15:26:01.408Z" }, + { url = "https://files.pythonhosted.org/packages/ef/2d/32e8bec360459107f9b451cc1a5b6fdd5f1d3e653e65a111502084f21e3a/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:742f9d24db5f1f46d8c7e29c599fe55b866d92c4a8181fcb95eab12da225ceb0", size = 474604, upload-time = "2025-11-12T15:25:49.122Z" }, + { url = "https://files.pythonhosted.org/packages/fe/0d/b5af1d55ede1ca07769a2cf71256073d8958e2a5521fc734fc19f5343283/torchaudio-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:4533fdafba73d7bcfcb5f1225b2cc8974a290ed0fe54c44638d6f440e91b8999", size = 2059899, upload-time = "2025-11-12T15:26:19.363Z" }, + { url = "https://files.pythonhosted.org/packages/2e/7c/df90eb0b337cbad59296ed91778e32be069330f5186256d4ce9ea603d324/torchaudio-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:923dccc67be4a6cbb45c3dcc2d69ee182bda75b09b69bc88cd3bcdfc739883a2", size = 665337, upload-time = "2025-11-12T15:26:07.407Z" }, + { url = "https://files.pythonhosted.org/packages/c0/1b/3321ad6379ac2d968064704e8d015c31ccae5d1ece070f87fb44b17d90e6/torchaudio-2.9.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bb69557484c92513a980027ec4cb314b0f43cf4442bbfd97440e66528dbad22d", size = 808136, upload-time = "2025-11-12T15:26:00.276Z" }, + { url = "https://files.pythonhosted.org/packages/76/e2/fe55b3882157fd57aa131f5bcad90f0329be90827e1c0e0c482662ddef38/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ba2799ceec5e4373a0aa26df30d608f1eaaefd8ac4a7ae0c3446f63106f5b5a5", size = 474349, upload-time = "2025-11-12T15:26:02.78Z" }, + { url = "https://files.pythonhosted.org/packages/74/d3/0b090c03cac5a20691507e0945589a696fb10402ccd2457eea47dbf8a71b/torchaudio-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bc3c8e9a240bfad8bc61f769324a4f3ce5d60eec161369d457c595c35dbb10c7", size = 2060343, upload-time = "2025-11-12T15:26:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/a0/db/2555cfd428f4bf09a4df1c6f9204d0acc217c46edb35776c16e7a2a9a1c9/torchaudio-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:13ee96ea9bbbc85e198cb671273af06f010e6981d7b912d001eef6bc74e23f4f", size = 665301, upload-time = "2025-11-12T15:26:04.952Z" }, + { url = "https://files.pythonhosted.org/packages/0c/58/e82d8b5f447abdddc950965f1395f36baef3602643dd069100c6369ba73e/torchaudio-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9290f6a6409deb1f9113d5aef97ec646eeee6410b6bcc57ab8b57066b54da7c1", size = 813456, upload-time = "2025-11-12T15:26:13.963Z" }, + { url = "https://files.pythonhosted.org/packages/ce/45/dd9ad6af9bb595095cd98028d270f933760968b92a3497282e31289ef3b4/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:eeae7ca60b64c4bfb78fbd104a089d072b151423d5d2f90da1da00787f03b800", size = 476577, upload-time = "2025-11-12T15:26:09.54Z" }, + { url = "https://files.pythonhosted.org/packages/79/97/c49aeb01d8a9ced2b8215a38b69b8eafd1afe295a487a73b7030c6ff3396/torchaudio-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:5f445e896215e6f7bba497dc68aab1e6cb077ae0ab3a90095067f16df6a9bb98", size = 2062158, upload-time = "2025-11-12T15:26:10.487Z" }, + { url = "https://files.pythonhosted.org/packages/ba/70/30b2a0ecca2a0a5e6a8cee8952fdea3872854ea5bcd86fe3df369fdc2543/torchaudio-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c558ba70d548f7491245ed7a35310f6310d83fc7591f073ab5fed9fd38cef987", size = 669253, upload-time = "2025-11-12T15:26:06.285Z" }, + { url = "https://files.pythonhosted.org/packages/5b/38/0dabf362f946ab5773d3db3322718d652d70ad12a82f500d54c6c8b9cc88/torchaudio-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:69a582650279ee16ff9087f99b4234fe5d766e1bf7f0be352db5f46991854c1e", size = 810496, upload-time = "2025-11-12T15:26:11.515Z" }, + { url = "https://files.pythonhosted.org/packages/05/1c/e05a32ee6868dc05463242db672f23dba5d042423fefcf294db4dac343a8/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:9c0d004f784c49078017f8217fdc901df0eb9724e50fb269b3a6c99b1d4eae75", size = 474566, upload-time = "2025-11-12T15:26:08.628Z" }, + { url = "https://files.pythonhosted.org/packages/15/52/8cec1fe90f05b888f9060467e1eb8c27f9295b8729a83d443e3bd7c471d3/torchaudio-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d2743b28ff5538d5fdf2ff6657d392852ccdfe640ede46f566b2907ca32d8dca", size = 2060358, upload-time = "2025-11-12T15:26:12.885Z" }, + { url = "https://files.pythonhosted.org/packages/04/73/6ba396813d714f895f86c82be61b590fbe14255ebe6866f5ea5916c075a3/torchaudio-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:234c7a9d4d0a6ed735cd37965baa9a89ca36bdbebece8a6a5ff7727acbb43026", size = 665039, upload-time = "2025-11-12T15:26:18.308Z" }, + { url = "https://files.pythonhosted.org/packages/9c/f6/237e00a04dea497a40a8567d024dfb39193abec3ca3695ad51919ad633d1/torchaudio-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e13cb38971ac259fc4e102282a3e48f6df5f0ab00eb785ca5155e3392d1e86f1", size = 813463, upload-time = "2025-11-12T15:26:16.261Z" }, + { url = "https://files.pythonhosted.org/packages/57/99/5fcd46a80086030899badeb5a934fab337c88325b3f68c60faa0b672d4d2/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:35c96ed1011b50eaf17948da173b09450cdc5bb7f908687571adb4a4c072c05e", size = 476577, upload-time = "2025-11-12T15:26:17.355Z" }, + { url = "https://files.pythonhosted.org/packages/a4/4c/bc428f71d5ef728fba2ecb151a3a6d187e6f0b9446b76e4f87e46d2206a3/torchaudio-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:c220c4acf9914cce2dc81c3624d7c84008ef436dc31bcbb89e8f4416d3615a34", size = 2062170, upload-time = "2025-11-12T15:26:20.837Z" }, + { url = "https://files.pythonhosted.org/packages/07/0e/be41f412e1225bdbd9b7fd7f41a20f070c707f5274b82542eeccf6dc2b79/torchaudio-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:cfd12934c7b54b41d4c79dfd26fbfe88fafa9cc5cc77c074e953bb7018d9322c", size = 669265, upload-time = "2025-11-12T15:26:14.976Z" }, +] + [[package]] name = "torchdata" version = "0.11.0" @@ -6291,7 +6339,7 @@ wheels = [ [[package]] name = "torchvision" -version = "0.24.1" +version = "0.24.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, From 6c6e35d289e2ab9fdafeaa77b986fe2ca6e44012 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 14 Nov 2025 09:48:12 -0800 Subject: [PATCH 28/34] fix Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py index e24aa1c9..8f9f4c34 100644 --- a/dfm/src/automodel/recipes/train.py +++ b/dfm/src/automodel/recipes/train.py @@ -31,6 +31,7 @@ from torch.distributed.fsdp import MixedPrecisionPolicy from transformers.utils.hub import TRANSFORMERS_CACHE +import wandb from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline from dfm.src.automodel.flow_matching.training_step_t2v import ( step_fsdp_transformer_t2v, @@ -119,7 +120,7 @@ def build_model_and_optimizer( logging.info("[INFO] NeMoAutoDiffusion setup complete (pipeline + optimizer)") - return pipe, optimizer, fsdp2_manager.device_mesh + return pipe, optimizer, getattr(fsdp2_manager, "device_mesh", None) def build_lr_scheduler( @@ -272,6 +273,9 @@ def setup(self): raise RuntimeError("Training dataloader is empty; cannot proceed with training") # Derive DP size consistent with model parallel config + tp_size = fsdp_cfg.get("tp_size", 1) + cp_size = fsdp_cfg.get("cp_size", 1) + pp_size = fsdp_cfg.get("pp_size", 1) denom = max(1, tp_size * cp_size * pp_size) self.dp_size = fsdp_cfg.get("dp_size", None) if self.dp_size is None: From 3f8b64fe1ca66a53f06842faeb12568ac65bc57e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 14 Nov 2025 09:56:32 -0800 Subject: [PATCH 29/34] update Signed-off-by: Alexandros Koumparoulis --- uv.lock | 223 +++----------------------------------------------------- 1 file changed, 9 insertions(+), 214 deletions(-) diff --git a/uv.lock b/uv.lock index b6443987..5a28703f 100644 --- a/uv.lock +++ b/uv.lock @@ -2656,7 +2656,8 @@ wheels = [ [[package]] name = "megatron-bridge" -source = { directory = "3rdparty/Megatron-Bridge" } +version = "0.3.0rc0" +source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" } dependencies = [ { name = "causal-conv1d" }, { name = "datasets" }, @@ -2678,70 +2679,10 @@ dependencies = [ { name = "wandb" }, ] -[package.metadata] -requires-dist = [ - { name = "causal-conv1d" }, - { name = "datasets" }, - { name = "hydra-core", specifier = ">1.3,<=1.3.2" }, - { name = "mamba-ssm" }, - { name = "megatron-core", extras = ["dev", "mlm"], directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" }, - { name = "nemo-run", marker = "extra == 'recipes'", specifier = ">=0.5.0a0,<0.6.0" }, - { name = "nvdlfw-inspect", marker = "extra == 'tensor-inspect'", specifier = "==0.2.1" }, - { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=54f85fe422d296cf04ea524130014bd3a2c3add1" }, - { name = "omegaconf", specifier = ">=2.3.0" }, - { name = "pyyaml", specifier = ">=6.0.2" }, - { name = "qwen-vl-utils" }, - { name = "regex", specifier = ">=2024.11.6" }, - { name = "rich" }, - { name = "six", specifier = ">=1.17.0" }, - { name = "tensorboard", specifier = ">=2.19.0" }, - { name = "tqdm", specifier = ">=4.67.1" }, - { name = "transformer-engine", extras = ["pytorch"], git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, - { name = "transformers", specifier = ">=4.57.1" }, - { name = "typing-extensions" }, - { name = "wandb", specifier = ">=0.19.10" }, -] -provides-extras = ["recipes", "tensor-inspect"] - -[package.metadata.requires-dev] -build = [ - { name = "cython", specifier = ">=3.0.0" }, - { name = "ninja" }, - { name = "numpy", specifier = "<2.0.0" }, - { name = "nvidia-mathdx" }, - { name = "pybind11" }, - { name = "setuptools" }, - { name = "torch" }, -] -dev = [ - { name = "mypy", specifier = ">=1.8.0" }, - { name = "pre-commit", specifier = ">=3.6.0" }, - { name = "ruff", specifier = ">=0.9.9" }, -] -docs = [ - { name = "myst-parser", specifier = ">=4.0.1" }, - { name = "nvidia-sphinx-theme", specifier = ">=0.0.8" }, - { name = "sphinx", specifier = ">=8.1.3" }, - { name = "sphinx-autobuild", specifier = ">=2024.10.3" }, - { name = "sphinx-autodoc2", specifier = ">=0.5.0" }, - { name = "sphinx-copybutton", specifier = ">=0.5.2" }, - { name = "sphinxcontrib-mermaid" }, -] -test = [ - { name = "click" }, - { name = "coverage", specifier = ">=7.8.1" }, - { name = "flake8", specifier = ">=7.2.0" }, - { name = "pygithub" }, - { name = "pylint", specifier = ">=3.3.7" }, - { name = "pytest", specifier = ">=8.3.5" }, - { name = "pytest-mock", specifier = ">=3.14.0" }, - { name = "pytest-runner", specifier = ">=6.0.1" }, - { name = "pytest-timeout", specifier = ">=2.4.0" }, -] - [[package]] name = "megatron-core" -source = { directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" } +version = "0.16.0rc0" +source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?subdirectory=3rdparty%2FMegatron-LM&rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" } dependencies = [ { name = "numpy" }, { name = "packaging" }, @@ -2781,94 +2722,6 @@ mlm = [ { name = "wandb" }, ] -[package.metadata] -requires-dist = [ - { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" }, - { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, - { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" }, - { name = "einops", marker = "extra == 'lts'" }, - { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942" }, - { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" }, - { name = "flashinfer-python", marker = "extra == 'dev'" }, - { name = "flask-restful", marker = "extra == 'mlm'" }, - { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" }, - { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" }, - { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" }, - { name = "numpy", specifier = "<2.0.0" }, - { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" }, - { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" }, - { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" }, - { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" }, - { name = "nvtx", marker = "extra == 'lts'" }, - { name = "onnxscript", marker = "extra == 'dev'" }, - { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" }, - { name = "packaging", specifier = ">=24.2" }, - { name = "sentencepiece", marker = "extra == 'mlm'" }, - { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" }, - { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" }, - { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" }, - { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" }, - { name = "tiktoken", marker = "extra == 'mlm'" }, - { name = "torch" }, - { name = "tqdm", marker = "extra == 'dev'" }, - { name = "tqdm", marker = "extra == 'lts'" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, - { name = "transformers", marker = "extra == 'lts'" }, - { name = "transformers", marker = "extra == 'mlm'" }, - { name = "wandb", marker = "extra == 'mlm'" }, - { name = "wget", marker = "extra == 'dev'" }, - { name = "wget", marker = "extra == 'lts'" }, - { name = "zarr", marker = "extra == 'lts'" }, -] -provides-extras = ["mlm", "dev", "lts"] - -[package.metadata.requires-dev] -build = [ - { name = "cython", specifier = ">=3.0.0" }, - { name = "hatchling" }, - { name = "nvidia-mathdx" }, - { name = "packaging", specifier = ">=24.2" }, - { name = "pybind11" }, - { name = "setuptools", specifier = "<80.0.0" }, - { name = "torch" }, -] -ci = [ - { name = "pandas" }, - { name = "python-gitlab" }, - { name = "slack-sdk" }, -] -docs = [ - { name = "myst-parser" }, - { name = "nvidia-sphinx-theme" }, - { name = "sphinx" }, - { name = "sphinx-autobuild" }, - { name = "sphinx-autodoc2" }, - { name = "sphinx-copybutton" }, -] -flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }] -linting = [ - { name = "black", specifier = "==24.4.2" }, - { name = "flake8", specifier = "==7.1.0" }, - { name = "isort", specifier = "==5.13.2" }, - { name = "pylint", specifier = "==3.2.6" }, - { name = "ruff", specifier = "~=0.9.0" }, -] -test = [ - { name = "coverage" }, - { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" }, - { name = "nltk" }, - { name = "pydantic" }, - { name = "pygithub" }, - { name = "pytest", specifier = "==8.3.5" }, - { name = "pytest-asyncio" }, - { name = "pytest-cov" }, - { name = "pytest-mock" }, - { name = "pytest-random-order" }, - { name = "pyyaml" }, - { name = "tensorboard" }, - { name = "wrapt" }, -] - [[package]] name = "megatron-energon" version = "6.0.1" @@ -3360,7 +3213,8 @@ wheels = [ [[package]] name = "nemo-automodel" -source = { directory = "3rdparty/Automodel" } +version = "0.1.0rc0" +source = { git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7#17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" } dependencies = [ { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "datasets" }, @@ -3380,65 +3234,6 @@ dependencies = [ { name = "wandb" }, ] -[package.metadata] -requires-dist = [ - { name = "backoff", marker = "extra == 'vlm'" }, - { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" }, - { name = "datasets", specifier = ">=4.0.0" }, - { name = "diffusers" }, - { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" }, - { name = "ftfy" }, - { name = "imageio-ffmpeg" }, - { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, - { name = "megatron-fsdp" }, - { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, - { name = "mlflow" }, - { name = "numba", marker = "extra == 'vlm'" }, - { name = "numpy", marker = "extra == 'vlm'" }, - { name = "opencv-python-headless", specifier = "==4.10.0.84" }, - { name = "pillow", marker = "extra == 'vlm'" }, - { name = "pybind11" }, - { name = "pyyaml" }, - { name = "qwen-omni-utils", marker = "extra == 'vlm'" }, - { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, - { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, - { name = "torchao" }, - { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, - { name = "torchdata" }, - { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" }, - { name = "transformers", specifier = "<=4.57.1" }, - { name = "wandb" }, -] -provides-extras = ["vlm", "fa", "moe"] - -[package.metadata.requires-dev] -build = [ - { name = "setuptools" }, - { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, - { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, -] -dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }] -docs = [ - { name = "myst-parser" }, - { name = "nvidia-sphinx-theme" }, - { name = "sphinx" }, - { name = "sphinx-autobuild" }, - { name = "sphinx-autodoc2" }, - { name = "sphinx-copybutton" }, -] -linting = [ - { name = "import-linter", specifier = "~=2.4" }, - { name = "pre-commit", specifier = ">=4.2.0" }, - { name = "ruff", specifier = "~=0.9.0" }, -] -test = [ - { name = "coverage" }, - { name = "peft" }, - { name = "pytest" }, -] - [[package]] name = "nemo-dfm" source = { editable = "." } @@ -3483,7 +3278,7 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] -megatron-bridge = [ +megatronbridge = [ { name = "megatron-bridge" }, ] test = [ @@ -3513,7 +3308,7 @@ requires-dist = [ ] [package.metadata.requires-dev] -automodel = [{ name = "nemo-automodel", directory = "3rdparty/Automodel" }] +automodel = [{ name = "nemo-automodel", git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }] build = [ { name = "cython", specifier = ">=3.0.0" }, { name = "ninja" }, @@ -3538,7 +3333,7 @@ docs = [ { name = "sphinx-autodoc2", specifier = ">=0.5.0" }, { name = "sphinx-copybutton", specifier = ">=0.5.2" }, ] -megatron-bridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }] +megatronbridge = [{ name = "megatron-bridge", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca" }] test = [ { name = "coverage", specifier = ">=7.8.1" }, { name = "flake8", specifier = ">=7.2.0" }, From 494c3fb29512b741e40b43182c1c5c01fc432835 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 14 Nov 2025 09:58:56 -0800 Subject: [PATCH 30/34] fix Signed-off-by: Alexandros Koumparoulis --- dfm/src/automodel/recipes/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfm/src/automodel/recipes/train.py b/dfm/src/automodel/recipes/train.py index 8f9f4c34..5a858fde 100644 --- a/dfm/src/automodel/recipes/train.py +++ b/dfm/src/automodel/recipes/train.py @@ -21,6 +21,7 @@ import torch import torch.distributed as dist +import wandb from nemo_automodel.components.checkpoint.checkpointing import Checkpointer, CheckpointingConfig from nemo_automodel.components.loggers.log_utils import setup_logging from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages @@ -31,7 +32,6 @@ from torch.distributed.fsdp import MixedPrecisionPolicy from transformers.utils.hub import TRANSFORMERS_CACHE -import wandb from dfm.src.automodel._diffusers.auto_diffusion_pipeline import NeMoWanPipeline from dfm.src.automodel.flow_matching.training_step_t2v import ( step_fsdp_transformer_t2v, From e3290e0bb6af25390dfea3bec89d01313121c4d6 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 17 Nov 2025 09:29:28 -0800 Subject: [PATCH 31/34] revert 3rdparty Signed-off-by: Alexandros Koumparoulis --- .gitmodules | 3 +++ 3rdparty/Automodel | 1 + pyproject.toml | 1 - 3 files changed, 4 insertions(+), 1 deletion(-) create mode 160000 3rdparty/Automodel diff --git a/.gitmodules b/.gitmodules index 274454c7..8ad240e1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "3rdparty/Automodel"] path = 3rdparty/Automodel url = https://github.com/NVIDIA-NeMo/Automodel.git +[submodule "3rdparty/Megatron-Bridge"] + path = 3rdparty/Megatron-Bridge + url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git diff --git a/3rdparty/Automodel b/3rdparty/Automodel new file mode 160000 index 00000000..a5f06522 --- /dev/null +++ b/3rdparty/Automodel @@ -0,0 +1 @@ +Subproject commit a5f06522d4f8ef67bb9bbdd9502e50ae27d2fee5 diff --git a/pyproject.toml b/pyproject.toml index 5e403755..82933f29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,7 +142,6 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } -megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } From 90f9bbc3e812ee0d4fae9279c188df670bcb6ddf Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 17 Nov 2025 10:23:06 -0800 Subject: [PATCH 32/34] update uv.lock Signed-off-by: Alexandros Koumparoulis --- uv.lock | 221 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 213 insertions(+), 8 deletions(-) diff --git a/uv.lock b/uv.lock index 5a28703f..ea6de63c 100644 --- a/uv.lock +++ b/uv.lock @@ -2656,8 +2656,7 @@ wheels = [ [[package]] name = "megatron-bridge" -version = "0.3.0rc0" -source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" } +source = { directory = "3rdparty/Megatron-Bridge" } dependencies = [ { name = "causal-conv1d" }, { name = "datasets" }, @@ -2679,10 +2678,70 @@ dependencies = [ { name = "wandb" }, ] +[package.metadata] +requires-dist = [ + { name = "causal-conv1d" }, + { name = "datasets" }, + { name = "hydra-core", specifier = ">1.3,<=1.3.2" }, + { name = "mamba-ssm" }, + { name = "megatron-core", extras = ["dev", "mlm"], directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" }, + { name = "nemo-run", marker = "extra == 'recipes'", specifier = ">=0.5.0a0,<0.6.0" }, + { name = "nvdlfw-inspect", marker = "extra == 'tensor-inspect'", specifier = "==0.2.1" }, + { name = "nvidia-resiliency-ext", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=54f85fe422d296cf04ea524130014bd3a2c3add1" }, + { name = "omegaconf", specifier = ">=2.3.0" }, + { name = "pyyaml", specifier = ">=6.0.2" }, + { name = "qwen-vl-utils" }, + { name = "regex", specifier = ">=2024.11.6" }, + { name = "rich" }, + { name = "six", specifier = ">=1.17.0" }, + { name = "tensorboard", specifier = ">=2.19.0" }, + { name = "tqdm", specifier = ">=4.67.1" }, + { name = "transformer-engine", extras = ["pytorch"], git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, + { name = "transformers", specifier = ">=4.57.1" }, + { name = "typing-extensions" }, + { name = "wandb", specifier = ">=0.19.10" }, +] +provides-extras = ["recipes", "tensor-inspect"] + +[package.metadata.requires-dev] +build = [ + { name = "cython", specifier = ">=3.0.0" }, + { name = "ninja" }, + { name = "numpy", specifier = "<2.0.0" }, + { name = "nvidia-mathdx" }, + { name = "pybind11" }, + { name = "setuptools" }, + { name = "torch" }, +] +dev = [ + { name = "mypy", specifier = ">=1.8.0" }, + { name = "pre-commit", specifier = ">=3.6.0" }, + { name = "ruff", specifier = ">=0.9.9" }, +] +docs = [ + { name = "myst-parser", specifier = ">=4.0.1" }, + { name = "nvidia-sphinx-theme", specifier = ">=0.0.8" }, + { name = "sphinx", specifier = ">=8.1.3" }, + { name = "sphinx-autobuild", specifier = ">=2024.10.3" }, + { name = "sphinx-autodoc2", specifier = ">=0.5.0" }, + { name = "sphinx-copybutton", specifier = ">=0.5.2" }, + { name = "sphinxcontrib-mermaid" }, +] +test = [ + { name = "click" }, + { name = "coverage", specifier = ">=7.8.1" }, + { name = "flake8", specifier = ">=7.2.0" }, + { name = "pygithub" }, + { name = "pylint", specifier = ">=3.3.7" }, + { name = "pytest", specifier = ">=8.3.5" }, + { name = "pytest-mock", specifier = ">=3.14.0" }, + { name = "pytest-runner", specifier = ">=6.0.1" }, + { name = "pytest-timeout", specifier = ">=2.4.0" }, +] + [[package]] name = "megatron-core" -version = "0.16.0rc0" -source = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?subdirectory=3rdparty%2FMegatron-LM&rev=b245756e35943083a77aff6217fe60dd1704f6ca#b245756e35943083a77aff6217fe60dd1704f6ca" } +source = { directory = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM" } dependencies = [ { name = "numpy" }, { name = "packaging" }, @@ -2722,6 +2781,94 @@ mlm = [ { name = "wandb" }, ] +[package.metadata] +requires-dist = [ + { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" }, + { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" }, + { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" }, + { name = "einops", marker = "extra == 'lts'" }, + { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=cf9909b777ffac18e05b67a6708282cadc000942" }, + { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" }, + { name = "flashinfer-python", marker = "extra == 'dev'" }, + { name = "flask-restful", marker = "extra == 'mlm'" }, + { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" }, + { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" }, + { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" }, + { name = "numpy", specifier = "<2.0.0" }, + { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" }, + { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" }, + { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" }, + { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" }, + { name = "nvtx", marker = "extra == 'lts'" }, + { name = "onnxscript", marker = "extra == 'dev'" }, + { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" }, + { name = "packaging", specifier = ">=24.2" }, + { name = "sentencepiece", marker = "extra == 'mlm'" }, + { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" }, + { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" }, + { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" }, + { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" }, + { name = "tiktoken", marker = "extra == 'mlm'" }, + { name = "torch" }, + { name = "tqdm", marker = "extra == 'dev'" }, + { name = "tqdm", marker = "extra == 'lts'" }, + { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" }, + { name = "transformers", marker = "extra == 'lts'" }, + { name = "transformers", marker = "extra == 'mlm'" }, + { name = "wandb", marker = "extra == 'mlm'" }, + { name = "wget", marker = "extra == 'dev'" }, + { name = "wget", marker = "extra == 'lts'" }, + { name = "zarr", marker = "extra == 'lts'" }, +] +provides-extras = ["mlm", "dev", "lts"] + +[package.metadata.requires-dev] +build = [ + { name = "cython", specifier = ">=3.0.0" }, + { name = "hatchling" }, + { name = "nvidia-mathdx" }, + { name = "packaging", specifier = ">=24.2" }, + { name = "pybind11" }, + { name = "setuptools", specifier = "<80.0.0" }, + { name = "torch" }, +] +ci = [ + { name = "pandas" }, + { name = "python-gitlab" }, + { name = "slack-sdk" }, +] +docs = [ + { name = "myst-parser" }, + { name = "nvidia-sphinx-theme" }, + { name = "sphinx" }, + { name = "sphinx-autobuild" }, + { name = "sphinx-autodoc2" }, + { name = "sphinx-copybutton" }, +] +flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }] +linting = [ + { name = "black", specifier = "==24.4.2" }, + { name = "flake8", specifier = "==7.1.0" }, + { name = "isort", specifier = "==5.13.2" }, + { name = "pylint", specifier = "==3.2.6" }, + { name = "ruff", specifier = "~=0.9.0" }, +] +test = [ + { name = "coverage" }, + { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" }, + { name = "nltk" }, + { name = "pydantic" }, + { name = "pygithub" }, + { name = "pytest", specifier = "==8.3.5" }, + { name = "pytest-asyncio" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, + { name = "pytest-random-order" }, + { name = "pyyaml" }, + { name = "tensorboard" }, + { name = "wrapt" }, +] + [[package]] name = "megatron-energon" version = "6.0.1" @@ -3213,8 +3360,7 @@ wheels = [ [[package]] name = "nemo-automodel" -version = "0.1.0rc0" -source = { git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7#17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" } +source = { directory = "3rdparty/Automodel" } dependencies = [ { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" }, { name = "datasets" }, @@ -3234,6 +3380,65 @@ dependencies = [ { name = "wandb" }, ] +[package.metadata] +requires-dist = [ + { name = "backoff", marker = "extra == 'vlm'" }, + { name = "bitsandbytes", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = "==0.45.5" }, + { name = "datasets", specifier = ">=4.0.0" }, + { name = "diffusers" }, + { name = "flash-attn", marker = "extra == 'fa'", specifier = "<=2.8.3" }, + { name = "ftfy" }, + { name = "imageio-ffmpeg" }, + { name = "liger-kernel", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'", specifier = ">=0.5.9" }, + { name = "megatron-fsdp" }, + { name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'" }, + { name = "mlflow" }, + { name = "numba", marker = "extra == 'vlm'" }, + { name = "numpy", marker = "extra == 'vlm'" }, + { name = "opencv-python-headless", specifier = "==4.10.0.84" }, + { name = "pillow", marker = "extra == 'vlm'" }, + { name = "pybind11" }, + { name = "pyyaml" }, + { name = "qwen-omni-utils", marker = "extra == 'vlm'" }, + { name = "qwen-vl-utils", extras = ["decord"], marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, + { name = "timm", marker = "extra == 'vlm'", specifier = "==1.0.16" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, + { name = "torchao" }, + { name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" }, + { name = "torchdata" }, + { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'moe'", specifier = "==2.8.0" }, + { name = "transformers", specifier = "<=4.57.1" }, + { name = "wandb" }, +] +provides-extras = ["vlm", "fa", "moe"] + +[package.metadata.requires-dev] +build = [ + { name = "setuptools" }, + { name = "torch", marker = "sys_platform != 'darwin'", specifier = "<=2.9.0", index = "https://download.pytorch.org/whl/cu129" }, + { name = "torch", marker = "sys_platform == 'darwin'", specifier = "<=2.9.0", index = "https://pypi.org/simple" }, +] +dev = [{ name = "cut-cross-entropy", git = "https://github.com/apple/ml-cross-entropy.git?rev=87a86ab" }] +docs = [ + { name = "myst-parser" }, + { name = "nvidia-sphinx-theme" }, + { name = "sphinx" }, + { name = "sphinx-autobuild" }, + { name = "sphinx-autodoc2" }, + { name = "sphinx-copybutton" }, +] +linting = [ + { name = "import-linter", specifier = "~=2.4" }, + { name = "pre-commit", specifier = ">=4.2.0" }, + { name = "ruff", specifier = "~=0.9.0" }, +] +test = [ + { name = "coverage" }, + { name = "peft" }, + { name = "pytest" }, +] + [[package]] name = "nemo-dfm" source = { editable = "." } @@ -3308,7 +3513,7 @@ requires-dist = [ ] [package.metadata.requires-dev] -automodel = [{ name = "nemo-automodel", git = "https://github.com/NVIDIA-NeMo/Automodel.git?rev=17055ab4fe820ba0e4c13aed8fd2c810b6551bf7" }] +automodel = [{ name = "nemo-automodel", directory = "3rdparty/Automodel" }] build = [ { name = "cython", specifier = ">=3.0.0" }, { name = "ninja" }, @@ -3333,7 +3538,7 @@ docs = [ { name = "sphinx-autodoc2", specifier = ">=0.5.0" }, { name = "sphinx-copybutton", specifier = ">=0.5.2" }, ] -megatronbridge = [{ name = "megatron-bridge", git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git?rev=b245756e35943083a77aff6217fe60dd1704f6ca" }] +megatronbridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }] test = [ { name = "coverage", specifier = ">=7.8.1" }, { name = "flake8", specifier = ">=7.2.0" }, From a0c5367e1a8797c28cce1382c34cc771613ae90f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 17 Nov 2025 10:26:25 -0800 Subject: [PATCH 33/34] fix Signed-off-by: Alexandros Koumparoulis --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 82933f29..5e403755 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,6 +142,7 @@ explicit = true [tool.uv.sources] nemo-automodel = { path = "3rdparty/Automodel" } megatron-bridge = { path = "3rdparty/Megatron-Bridge" } +megatron-core = { path = "3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/" } transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } nvidia-resiliency-ext = { index = "pypi" } From 7b108d1454e5f9f0f1b5b467070bd866a4f315df Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Mon, 17 Nov 2025 10:49:48 -0800 Subject: [PATCH 34/34] update uv.lock Signed-off-by: Alexandros Koumparoulis --- uv.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/uv.lock b/uv.lock index ea6de63c..b6443987 100644 --- a/uv.lock +++ b/uv.lock @@ -3483,7 +3483,7 @@ docs = [ { name = "sphinx-autodoc2" }, { name = "sphinx-copybutton" }, ] -megatronbridge = [ +megatron-bridge = [ { name = "megatron-bridge" }, ] test = [ @@ -3538,7 +3538,7 @@ docs = [ { name = "sphinx-autodoc2", specifier = ">=0.5.0" }, { name = "sphinx-copybutton", specifier = ">=0.5.2" }, ] -megatronbridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }] +megatron-bridge = [{ name = "megatron-bridge", directory = "3rdparty/Megatron-Bridge" }] test = [ { name = "coverage", specifier = ">=7.8.1" }, { name = "flake8", specifier = ">=7.2.0" },