2 changes: 1 addition & 1 deletion .gitmodules
@@ -11,7 +11,7 @@
 [submodule "3rdparty/Automodel-workspace/Automodel"]
 	path = 3rdparty/Automodel-workspace/Automodel
 	url = https://github.com/NVIDIA-NeMo/Automodel.git
-	branch = yifu/bump-torch-and-hf
+	branch = main
 	shallow = true
 [submodule "3rdparty/Gym-workspace/Gym"]
 	path = 3rdparty/Gym-workspace/Gym
2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 180 files
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
Submodule Megatron-Bridge updated 714 files
11 changes: 9 additions & 2 deletions 3rdparty/Megatron-Bridge-workspace/setup.py
@@ -26,9 +26,15 @@
 bridge_package_name = "megatron.bridge"

 CACHED_DEPENDENCIES = [
-    "transformers>=5.0.0,<=5.2.0",
+    "transformers>=5.0.0,<=5.3.0",
     "peft>=0.18.1",
     "datasets>=2.20.0",
+    "accelerate",
+    "diffusers>=0.36.0",
+    "peft>=0.18.0",
+    "einops",
+    "imageio",
+    "imageio-ffmpeg",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
     "typing-extensions",
@@ -44,12 +50,13 @@
     # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2111): upgrade to core_cu13 when we move to CUDA 13 base container
     "transformer-engine[pytorch,core_cu12]",
     "mamba-ssm",
-    "nvidia-resiliency-ext~=0.4.1",
+    "nvidia-resiliency-ext~=0.5.0",
     "causal-conv1d",
     "flash-linear-attention",
     "timm",
     "open-clip-torch>=3.2.0",
     "mlflow>=3.5.0",
+    "comet-ml>=3.50.0",
     "torch>=2.6.0",
 ]
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM-workspace/Megatron-LM
Submodule Megatron-LM updated 574 files
5 changes: 2 additions & 3 deletions 3rdparty/Megatron-LM-workspace/setup.py
@@ -49,8 +49,8 @@
     # Dev dependencies from pyproject.toml
     "nvidia-modelopt[torch]; sys_platform != 'darwin'",
     # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2111): upgrade to core_cu13 when we move to CUDA 13 base container
-    "transformer-engine[pytorch,core_cu12]>=2.9.0a0,<2.12.0",
-    "nvidia-resiliency-ext",
+    "transformer-engine[pytorch,core_cu12]",
+    "nvidia-resiliency-ext @ git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@v0.5.0",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
@@ -60,7 +60,6 @@
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
     "flash-linear-attention~=0.4.0",
-    "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
     "flashinfer-python~=0.5.0",
@@ -9,7 +9,7 @@ policy:
   name: ${policy.model_name}
   train_global_batch_size: 32
   train_micro_batch_size: 1
-  max_total_sequence_length: 6000
+  max_total_sequence_length: 3200
   dtensor_cfg:
     enabled: false
   megatron_cfg:
@@ -26,8 +26,6 @@ policy:
       lr: 1.0e-06
       min_lr: 1.0e-06
       adam_beta2: 0.999
-      use_distributed_optimizer: false
-      use_precision_aware_optimizer: false
     scheduler:
       lr_warmup_iters: 10
       lr_warmup_init: 1.0e-11
@@ -30,6 +30,7 @@ policy:
     sequence_parallel: true
     moe_permute_fusion: true
     apply_rope_fusion: false
+    gradient_accumulation_fusion: false
     moe_enable_deepep: true
     moe_token_dispatcher_type: flex
     optimizer:
@@ -26,6 +26,7 @@ policy:
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
     apply_rope_fusion: false
+    gradient_accumulation_fusion: false
     fp8_cfg:
       enabled: true
     optimizer:
@@ -27,6 +27,7 @@ policy:
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
     apply_rope_fusion: false
+    gradient_accumulation_fusion: false
     optimizer:
       lr: 1.0e-06
     scheduler:
15 changes: 12 additions & 3 deletions nemo_rl/models/dtensor/parallelize.py
@@ -489,9 +489,10 @@ def _parallelize_nm5_h(
         "mixer.down_proj": RowwiseParallel(),
     }

-    # Native transformers NemotronH uses model.model.layers, custom uses model.backbone.layers
-    inner_model = model.backbone if hasattr(model, "backbone") else model.model
+    # NemotronH uses .backbone (trust_remote_code) or .model (native transformers >= 5.3.0)
+    inner_model = getattr(model, "backbone", model.model)
     layers: torch.nn.ModuleList = inner_model.layers
+
     parallelize_module(model, tp_mesh, model_tp_plan)

     for layer in inner_model.layers:
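Note: one subtlety of the new getattr form, worth keeping in mind when reading this hunk. Unlike the hasattr expression it replaces, getattr's default argument (model.model) is evaluated eagerly, so it must resolve on both layouts. A tiny self-contained illustration with hypothetical stand-in classes (not the real transformers classes):

class RemoteCodeStyle:
    """Stand-in for the trust_remote_code layout: layers under .backbone.
    .model is defined too, because getattr's default is evaluated eagerly."""
    def __init__(self):
        self.backbone = "backbone layers"
        self.model = "native layers"

class NativeStyle:
    """Stand-in for native transformers >= 5.3.0: layers under .model."""
    def __init__(self):
        self.model = "native layers"

for m in (RemoteCodeStyle(), NativeStyle()):
    inner_model = getattr(m, "backbone", m.model)  # prefer .backbone, else .model
    print(type(m).__name__, "->", inner_model)
# RemoteCodeStyle -> backbone layers
# NativeStyle -> native layers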
@@ -522,14 +523,22 @@ def _parallelize_nm5_h(

     # do not reshard after forward for root model
     # because its parameters will be used in backward immediately
-    return fully_shard(
+    result = fully_shard(
         model,
         mesh=dp_mesh,
         mp_policy=mp_policy,
         offload_policy=offload_policy,
         reshard_after_forward=False,
     )

+    # Register .model so the native transformers forward() (self.model(...)) resolves
+    # correctly after FSDP2 wrapping, regardless of whether the class uses .backbone
+    # (trust_remote_code) or .model (native transformers). kernel_patches.py wraps
+    # the native forward which always calls self.model(...).
+    result.register_module("model", inner_model)
+
+    return result


 def _parallelize_model(
     model: Union[
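A minimal sketch of why the register_module call matters, using plain torch.nn with no FSDP2 involved; Inner and BackboneStyleWrapper are hypothetical stand-ins for the real classes:

import torch
import torch.nn as nn

class Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4)])

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class BackboneStyleWrapper(nn.Module):
    """Stand-in for the trust_remote_code class: the submodule is registered
    as .backbone, so a forward() that calls self.model(...) raises AttributeError."""
    def __init__(self):
        super().__init__()
        self.backbone = Inner()

    def forward(self, x):
        # Mimics the native transformers forward that kernel_patches.py wraps.
        return self.model(x)

wrapper = BackboneStyleWrapper()
x = torch.randn(2, 4)
try:
    wrapper(x)
except AttributeError as e:
    print("before register_module:", e)

# The fix from this diff: expose the inner module under the name .model as well.
wrapper.register_module("model", wrapper.backbone)
print("after register_module:", wrapper(x).shape)  # torch.Size([2, 4])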
6 changes: 6 additions & 0 deletions nemo_rl/models/megatron/community_import.py
@@ -75,6 +75,12 @@ def import_model_from_hf_name(
     ]
     model_provider.pipeline_dtype = megatron_config["pipeline_dtype"]
     model_provider.sequence_parallel = megatron_config["sequence_parallel"]
+    if (
+        gradient_accumulation_fusion := megatron_config.get(
+            "gradient_accumulation_fusion"
+        )
+    ) is not None:
+        model_provider.gradient_accumulation_fusion = gradient_accumulation_fusion
     model_provider.finalize()

     from megatron.core import parallel_state
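A minimal sketch of the guarded-override pattern used in this hunk: copy a key from a config dict onto a provider object only when the key is present and not None, so an omitted key leaves the provider's default untouched. MockProvider and the config dicts below are illustrative, not the real Megatron-Bridge API:

class MockProvider:
    gradient_accumulation_fusion = True  # provider default

for megatron_config in ({}, {"gradient_accumulation_fusion": False}):
    provider = MockProvider()
    if (
        gradient_accumulation_fusion := megatron_config.get(
            "gradient_accumulation_fusion"
        )
    ) is not None:
        provider.gradient_accumulation_fusion = gradient_accumulation_fusion
    print(megatron_config, "->", provider.gradient_accumulation_fusion)
# {} -> True  (default preserved)
# {'gradient_accumulation_fusion': False} -> False  (explicit override)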
30 changes: 19 additions & 11 deletions pyproject.toml
@@ -43,18 +43,18 @@ dependencies = [
     "sympy>=1.14.0",
     "pillow>=11.3.0",
     "torchvision==0.25.0",
-    "transformers==5.2.0",
+    "transformers==5.3.0",
     "num2words>=0.5.14", # for SmolVLM
     "mlflow>=3.5.0,<3.6.0",
     "nvidia-nvshmem-cu12; sys_platform == 'linux' and (platform_machine == 'x86_64' or platform_machine == 'aarch64')", # for deep_ep build
     "swanlab",
     "pyzmq",
     "decord2",
+    "nvidia-resiliency-ext",
-    "nccl4py", # for non-colocated refit
-    "cuda-bindings", # for non-colocated refit
-    "pybase64", # for sglang refit
-    "nvidia-cudnn-cu12==9.19.0.56", # for transformer-engine no build isolation
-
+    "nccl4py", # for non-colocated refit
+    "cuda-bindings", # for non-colocated refit
+    "pybase64", # for sglang refit
+    "nvidia-cudnn-cu12==9.19.0.56", # for transformer-engine no build isolation
 ]

 [project.optional-dependencies]
@@ -65,7 +65,7 @@ automodel = [
     # https://github.com/NVIDIA/TransformerEngine/blob/v2.3/transformer_engine/pytorch/attention/dot_product_attention/utils.py#L108
     # https://github.com/facebookresearch/xformers/blob/8354497deb2c04c67fbb2e2ad911e86530da0e90/xformers/ops/fmha/flash.py#L76
     "flash-attn==2.8.1",
-    "transformers>=5.0.0",
+    "transformers>=5.3.0",
     "mamba-ssm",
     "causal-conv1d",
     "nv-grouped-gemm",
@@ -258,7 +258,7 @@ override-dependencies = [
     "opentelemetry-api>=1.33.1",
     # vLLM 0.17.0 code is compatible with transformers v5 but the PyPI metadata still declares <5.
     # Override until vllm officially relaxes the constraint (https://github.com/vllm-project/vllm/issues/30466).
-    "transformers==5.2.0",
+    "transformers==5.3.0",
 ]
 # CVE fixes
 constraint-dependencies = [
@@ -356,6 +356,7 @@ name = "nv-grouped-gemm"
 version = "v1.1.4.post7"
 requires-dist = ["setuptools", "wheel", "torch", "numpy"]

+
 [[tool.uv.dependency-metadata]]
 name = "sgl-kernel"
 # This version has to match the version in the commit/rev/tag used
@@ -423,7 +424,7 @@ requires-dist = [
     "torchcodec==0.8.0 ; sys_platform != 'linux' or (sys_platform == 'linux' and platform_machine != 'aarch64' and platform_machine != 'arm64' and platform_machine != 'armv7l')",
     "torchvision",
     "tqdm",
-    "transformers==5.2.0",
+    "transformers==5.3.0",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.27",
@@ -440,9 +441,15 @@ name = "megatron-bridge"
 # Must stay in sync with 3rdparty/Megatron-Bridge-workspace/setup.py::CACHED_DEPENDENCIES.
 version = "0.0.0"
 requires-dist = [
-    "transformers>=5.0.0,<=5.2.0",
+    "transformers>=5.0.0,<=5.3.0",
     "peft>=0.18.1",
     "datasets>=2.20.0",
+    "accelerate",
+    "diffusers>=0.36.0",
+    "peft>=0.18.0",
+    "einops",
+    "imageio",
+    "imageio-ffmpeg",
     "omegaconf>=2.3.0",
     "tensorboard>=2.19.0",
     "typing-extensions",
@@ -459,12 +466,13 @@
     # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2111): upgrade to core_cu13 when we move to CUDA 13
     "transformer-engine[pytorch,core_cu12]",
     "mamba-ssm",
-    "nvidia-resiliency-ext~=0.4.1",
+    "nvidia-resiliency-ext~=0.5.0",
     "causal-conv1d",
     "flash-linear-attention",
     "timm",
     "open-clip-torch>=3.2.0",
     "mlflow>=3.5.0",
+    "comet-ml>=3.50.0",
     "torch>=2.6.0",
 ]
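The "Must stay in sync" comment in the hunk above is enforced only by convention. A hedged sketch of what an automated check could look like; the paths follow this repo's layout, and the subset assertion is an assumption (requires-dist also carries entries beyond CACHED_DEPENDENCIES):

import ast
import tomllib
from pathlib import Path

# Parse the pyproject mirror: [[tool.uv.dependency-metadata]] is a list of tables.
pyproject = tomllib.loads(Path("pyproject.toml").read_text())
mirror = next(
    entry["requires-dist"]
    for entry in pyproject["tool"]["uv"]["dependency-metadata"]
    if entry["name"] == "megatron-bridge"
)

# Pull CACHED_DEPENDENCIES out of setup.py without executing it.
tree = ast.parse(Path("3rdparty/Megatron-Bridge-workspace/setup.py").read_text())
cached = next(
    ast.literal_eval(node.value)
    for node in ast.walk(tree)
    if isinstance(node, ast.Assign)
    and any(getattr(target, "id", None) == "CACHED_DEPENDENCIES" for target in node.targets)
)

missing = set(cached) - set(mirror)
assert not missing, f"pyproject.toml mirror is missing: {sorted(missing)}"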
@@ -35,10 +35,10 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     uv run tests/check_metrics.py $JSON_METRICS \
         'data["train/loss"]["1"] < 0.69316' \
-        'data["train/loss"]["15"] < 0.63263' \
+        'data["train/loss"]["11"] < 0.53' \
         'data["train/preference_loss"]["1"] > 0.69314' \
         'data["train/preference_loss"]["1"] < 0.69316' \
-        'data["train/preference_loss"]["15"] < 0.63263' \
+        'data["train/preference_loss"]["11"] < 0.53' \
        'mean(data["timing/train/total_step_time"], -5, -1) < 5'

    # Clean up checkpoint directory after successful run to save space.
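For reference, the jq guard in these scripts finds the largest step index logged under train/loss and only runs the metric assertions once the run has reached MAX_STEPS. A rough Python equivalent of that gate; the metrics layout is assumed from the json_dump_tb_logs.py output, and the file name is illustrative:

import json

MAX_STEPS = 15  # illustrative; each script defines its own value

with open("json_metrics.json") as f:  # $JSON_METRICS in the scripts
    data = json.load(f)

# jq: to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max
max_logged_step = max(int(step) for step in data["train/loss"])

if max_logged_step >= MAX_STEPS:
    # Run completed: late-step assertions such as data["train/loss"]["11"]
    # are now safe, with no KeyError masquerading as a regression.
    assert data["train/loss"]["1"] < 0.69316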
@@ -35,8 +35,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     # Smoke checks: run completed and loss is finite/reasonable.
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["10"] > 0.0' \
-        'data["train/loss"]["10"] < 20.0'
+        'data["train/loss"]["1"] < 0.7' \
+        'data["train/loss"]["10"] < 0.7' \
+        'data["train/accuracy"]["10"] > 0.56'

     # Clean up checkpoint directory after successful run to save space.
     rm -rf "$CKPT_DIR"
@@ -39,7 +39,7 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
         'data["train/loss"]["1"] < 0.6' \
         'data["train/loss"]["250"] < 0.36' \
         'max(data["ray/node.0.gpu.0.mem_gb"]) < 75' \
-        'mean(data["timing/train/total_step_time"], -6, -1) < 10'
+        'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'

     # Clean up checkpoint directory after successful run to save space.
     rm -rf "$CKPT_DIR"
@@ -36,8 +36,8 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     # Smoke checks: run completed and loss is finite/reasonable.
     uv run tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["10"] > 0.0' \
-        'data["train/loss"]["10"] < 20.0'
+        'data["train/loss"]["1"] < 0.49' \
+        'data["train/loss"]["10"] < 0.45'

     # Clean up checkpoint directory after successful run to save space.
     rm -rf "$CKPT_DIR"
36 changes: 15 additions & 21 deletions tests/unit/prepare_unit_test_assets.py
@@ -36,33 +36,27 @@
 def build_tiny_nemotron5_h_checkpoint(model_path: str) -> None:
     import shutil

-    import transformers
-    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    from transformers.models.nemotron_h import NemotronHConfig

-    assert transformers.__version__ < "5.3.0", (
-        "NemotronHConfig is supported in transformers 5.3.0 or later, use NemotronHConfig instead"
-    )
-
-    config = AutoConfig.from_pretrained(
-        "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True
-    )
-    config.hybrid_override_pattern = "M*-"
-    config.num_hidden_layers = 3
-    config.intermediate_size = 32
-    config.hidden_size = 256
-    config.num_attention_heads = 8
-    config.mamba_num_heads = 8
-    config.num_key_value_heads = 8
-    config.n_groups = 1
+    config = NemotronHConfig(
+        layers_block_type=["mamba", "attention", "mamba"],
+        num_hidden_layers=3,
+        intermediate_size=32,
+        hidden_size=256,
+        num_attention_heads=8,
+        mamba_num_heads=8,
+        num_key_value_heads=8,
+        n_groups=1,
+        vocab_size=131072,
+    )

-    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_config(config)

     tokenizer = AutoTokenizer.from_pretrained(
         "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True
     )

-    # Disable tied weights for transformers 5.2.0 to avoid error
-    model._tied_weights_keys = None
-
     shutil.rmtree(model_path, ignore_errors=True)
     model.save_pretrained(model_path)
     tokenizer.save_pretrained(model_path)
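Because the config is now built natively (NemotronHConfig from transformers.models.nemotron_h), reloading the saved asset should no longer need trust_remote_code on the model side. A hedged usage sketch; the path is illustrative, and the tokenizer may still require trust_remote_code depending on the saved tokenizer class:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/tmp/tiny_nemotron5_h"  # wherever build_tiny_nemotron5_h_checkpoint() wrote it
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(model.config.num_hidden_layers)  # 3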