From 8398b856ee38cfc0bebd01a51d2f3ceff8b7585a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 29 Aug 2025 06:26:21 +0000 Subject: [PATCH 01/15] feat: add config_tools.py and refactor configs Signed-off-by: Terry Kong compare command Signed-off-by: Terry Kong config changes Signed-off-by: Terry Kong Revert "config changes" This reverts commit 25b87e2c603c56bedbabf3d8898bfe6ec6aa2ae9. Signed-off-by: Terry Kong cleanup Signed-off-by: Terry Kong vlm example Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit 137548006f003c7380756117db9732209eafd02b. Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit a4cd8a4870964d0a3a180805980ecdd943bd70a5. Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong force sft configs to use default chat template to match last releases behavior Signed-off-by: Terry Kong reverting select configs to v1 to address Signed-off-by: Terry Kong add pre-commit and add a minimize-check func Signed-off-by: Terry Kong Revert "reverting select configs to v1 to address" This reverts commit d81f806f0c5e6a7370321b6bbae9a87448bc7317. Signed-off-by: Terry Kong Revert "force sft configs to use default chat template to match last releases" This reverts commit be01df7f69f89667b0face375bad561021d6b180. Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit e54f144eef3e6dbe451154a2bbcbcf39099ea3a9. Signed-off-by: Terry Kong --- .pre-commit-config.yaml | 25 +++ tools/config_cli.py | 450 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 475 insertions(+) create mode 100755 tools/config_cli.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d1a05182d..6b55ea31e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,3 +46,28 @@ repos: require_serial: true additional_dependencies: [] minimum_pre_commit_version: "2.9.2" + + - repo: local + hooks: + - id: configs-minimize-check-llm + name: minimize-check llm recipes + language: system + pass_filenames: false + entry: bash + args: + - -lc + - | + set -euo pipefail + base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + - id: configs-minimize-check-vlm + name: minimize-check vlm recipes + language: system + pass_filenames: false + entry: bash + args: + - -lc + - | + set -euo pipefail + base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done diff --git a/tools/config_cli.py b/tools/config_cli.py new file mode 100755 index 0000000000..9583a55011 --- /dev/null +++ b/tools/config_cli.py @@ -0,0 +1,450 @@ +#!/usr/bin/env -S uv run --script -q +# /// script +# dependencies = [ +# "omegaconf" +# ] +# /// +"""Utilities for working with YAML configs in this repo. + +Subcommands: + - expand: Resolve a config with OmegaConf interpolation and inheritance. 
+  - minimize: Given a base config and a config, remove keys in the config that
+    are equal to the base, and ensure a defaults entry pointing to the base
+    exists. The defaults path in the resulting config is written relative to
+    the directory of the config being minimized.
+  - minimize-check: Same args as `minimize` but only checks if minimization
+    would change the file; exits non-zero if changes are needed.
+
+The `expand` and `minimize` subcommands support printing to stdout or in-place editing of the config file.
+
+Example:
+    # Expand a config with a root level "defaults" key to see the full config; print to stdout
+    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+
+    # Expand a config with a root level "defaults" key to see the full config; edit the config in place
+    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+
+    # Minimize a config and remove all keys that are present in the base config; print to stdout
+    # uv run tools/config_cli.py minimize
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+
+    # Minimize a config and remove all keys that are present in the base config; edit the config in place
+    # uv run tools/config_cli.py minimize
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+
+    # Minimize all the llm configs:
+    for algo in grpo dpo sft; do
+      base_config=examples/configs/${algo}.yaml
+      if [[ ${algo} == grpo ]]; then
+        base_config=examples/configs/grpo_math_1B.yaml
+      fi
+      for recipe in examples/configs/recipes/llm/${algo}-*.yaml; do
+        uv run tools/config_cli.py minimize $base_config $recipe --in-place
+      done
+    done
+
+    # Minimize vlm configs:
+    for recipe in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do
+      uv run tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
+    done
+
+    # Compare two configs
+    uv run tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
+
+    # Minimize a config and compare it to not minimizing (should be the same)
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml >examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized
+    uv run tools/config_cli.py compare \
+        examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml \
+        examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# ============================================================================
+# VENDORED SECTION: Minimal self-contained config loader (no nemo_rl dependency)
+#
+# Original source: `nemo_rl/utils/config.py`
+#   - Functions adapted: `_resolve_path`, `load_config_with_inheritance`, `load_config`
+#   - Purpose: avoid importing from nemo_rl so this script is standalone
+#   - If upstream changes, consider updating this vendored block accordingly
+# ============================================================================
+from typing import Any, Iterable, Optional, Union, cast
+
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+
+def _resolve_path(base_path: Path, path: str) -> Path:
+    if path.startswith("/"):
+        return Path(path)
+    return (base_path / path).resolve()
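+
+# Illustrative sketch of the `defaults` inheritance resolved below (hypothetical
+# file contents, not real repo configs):
+#
+#   base.yaml:            child.yaml:
+#     lr: 1.0e-5            defaults: base.yaml
+#     optim:                optim:
+#       name: adamw           name: sgd
+#
+#   load_config("child.yaml") -> {"lr": 1e-05, "optim": {"name": "sgd"}}
+#
+# Parents listed in `defaults` are merged in order, and the child is merged
+# last, so child keys override parent keys.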
+
+
+def load_config_with_inheritance(
+    config_path: Union[str, Path], base_dir: Optional[Union[str, Path]] = None
+) -> DictConfig:
+    """Load a YAML config and resolve simple inheritance via a top-level `defaults` key.
+
+    Supports:
+      - `defaults: parent.yaml` (string)
+      - `defaults: [parent1.yaml, parent2.yaml]` (list)
+      - Nested inheritance via parent files with their own `defaults`.
+    """
+    config_path = Path(config_path).resolve()
+    if base_dir is None:
+        base_dir = config_path.parent
+    base_dir = Path(base_dir)
+
+    cfg = OmegaConf.load(config_path)
+    if not isinstance(cfg, DictConfig):
+        raise TypeError(
+            f"Config at {config_path} must be a mapping (DictConfig), got {type(cfg)}"
+        )
+
+    if "defaults" in cfg:
+        defaults = cfg.pop("defaults")
+        if isinstance(defaults, (str, Path)):
+            defaults_list = [str(defaults)]
+        elif isinstance(defaults, ListConfig):
+            defaults_list = [str(d) for d in defaults]
+        elif isinstance(defaults, list):
+            defaults_list = [str(d) for d in defaults]
+        else:
+            raise TypeError(
+                f"Unsupported type for defaults: {type(defaults)} in {config_path}"
+            )
+
+        merged: DictConfig = OmegaConf.create({})  # type: ignore[assignment]
+        for default_entry in defaults_list:
+            parent_path = _resolve_path(base_dir, str(default_entry))
+            parent_cfg = load_config_with_inheritance(parent_path, base_dir)
+            merged = cast(DictConfig, OmegaConf.merge(merged, parent_cfg))
+
+        cfg = cast(DictConfig, OmegaConf.merge(merged, cfg))
+
+    return cfg
+
+
+def load_config(config_path: Union[str, Path]) -> DictConfig:
+    return load_config_with_inheritance(config_path)
+
+
+# ============================================================================
+# END VENDORED SECTION
+# ============================================================================
+
+
+def _dict_like(obj: Any) -> bool:
+    return isinstance(obj, dict)
+
+
+def _list_like(obj: Any) -> bool:
+    return isinstance(obj, list)
+
+
+REMOVE = object()
+
+
+def _prune_equal(a: Any, b: Any) -> Any:
+    """Return a copy of `a` with entries equal to `b` removed.
+
+    - If both are dicts: recursively prune and drop keys whose subtree is empty
+      after pruning or equal.
+    - If both are lists of the same length: drop the list only if it equals the
+      base in its entirety; partial per-index pruning is avoided so ordered
+      config sections keep their meaning.
+    - Else: if equal, return a sentinel indicating removal; otherwise return `a`.
+    """
+    if _dict_like(a) and _dict_like(b):
+        out: dict[str, Any] = {}
+        a_dict: dict[str, Any] = a  # type: ignore[assignment]
+        b_dict: dict[str, Any] = b  # type: ignore[assignment]
+        for key, a_val in a_dict.items():
+            if key in b_dict:
+                pruned = _prune_equal(a_val, b_dict[key])
+                if pruned is REMOVE:
+                    # equal, skip
+                    continue
+                # keep if subtree has content
+                if pruned != {} and pruned != []:
+                    out[key] = pruned
+            else:
+                out[key] = a_val
+        return out
+
+    if _list_like(a) and _list_like(b) and len(a) == len(b):
+        # Only remove if entire list equals base; avoid partial list pruning
+        # to prevent semantic changes in ordered config sections.
+        if a == b:
+            return REMOVE
+        return a
+
+    # Base types
+    if a == b:
+        return REMOVE
+    return a
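+
+# A quick illustrative sketch of the pruning semantics (made-up values, not a doctest):
+#   _prune_equal({"a": 1, "b": {"c": 2, "d": 3}}, {"a": 1, "b": {"c": 2}})
+#   -> {"b": {"d": 3}}                        # "a" and "b.c" match the base and are dropped
+#   _prune_equal([1, 2], [1, 2]) is REMOVE    # lists are only dropped on whole-list equality
+#   _prune_equal([1, 2], [1, 3]) == [1, 2]    # any difference keeps the full list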
+
+
+def _ensure_defaults_relative(
+    child_path: Path, base_path: Path, child_cfg: dict[str, Any]
+) -> None:
+    """Ensure `defaults:` points to the base config via a relative path.
+
+    The stored path must be a string such that, when the resulting minimized
+    config sits at `child_path`, the `defaults` entry resolves to the base
+    config location. Since the loader resolves `defaults` entries relative to
+    the directory of the config being loaded, we store `base_path` expressed
+    relative to the child config's directory.
+    """
+    # Compute a relative reference from child dir to base file
+    import os
+
+    rel_from_child_to_base = os.path.relpath(
+        str(base_path), start=str(child_path.parent)
+    )
+
+    existing = child_cfg.get("defaults")
+    if existing is None:
+        child_cfg["defaults"] = str(rel_from_child_to_base)
+        return
+    # Normalize various forms: string, single list element, list
+    if isinstance(existing, str):
+        existing_list: list[Any] = [existing]
+    else:
+        existing_list = list(existing) if isinstance(existing, Iterable) else [existing]
+    # Put our base at the first position if not present
+    if str(rel_from_child_to_base) not in [str(x) for x in existing_list]:
+        existing_list.insert(0, str(rel_from_child_to_base))
+    # If it's a single element list, collapse to string for this repo's style
+    if len(existing_list) == 1:
+        child_cfg["defaults"] = existing_list[0]
+    else:
+        child_cfg["defaults"] = existing_list
+
+
+def expand(args: argparse.Namespace) -> int:
+    # Merge defaults/inheritance using the repo loader
+    cfg = load_config(str(Path(args.config).resolve()))
+    # Convert without resolving so ${...} interpolations are preserved
+    text = OmegaConf.to_yaml(cfg)
+    if args.in_place:
+        Path(args.config).write_text(text)
+    else:
+        print(text + ("\n" if not text.endswith("\n") else ""), end="")
+    return 0
+
+
+def minimize(args: argparse.Namespace) -> int:
+    child_path = Path(args.config).resolve()
+    base_path = Path(args.base).resolve()
+
+    child_cfg_raw = OmegaConf.load(child_path)
+    if not isinstance(child_cfg_raw, DictConfig):
+        raise TypeError(
+            f"Config at {child_path} must be a mapping (DictConfig), got {type(child_cfg_raw)}"
+        )
+    base_cfg_raw = OmegaConf.load(base_path)
+    if not isinstance(base_cfg_raw, DictConfig):
+        raise TypeError(
+            f"Config at {base_path} must be a mapping (DictConfig), got {type(base_cfg_raw)}"
+        )
+
+    # Resolve both before comparison
+    child_resolved = OmegaConf.to_container(child_cfg_raw)
+    base_resolved = OmegaConf.to_container(base_cfg_raw)
+
+    if not isinstance(child_resolved, dict) or not isinstance(base_resolved, dict):
+        raise TypeError("Both child and base configs must be mappings after resolution")
+
+    pruned = _prune_equal(child_resolved, base_resolved)
+
+    # Ensure mapping output
+    if pruned is None or not isinstance(pruned, dict):
+        pruned = {} if pruned is None else {"value": pruned}
+
+    # Ensure defaults reference base (relative path from child)
+    _ensure_defaults_relative(child_path, base_path, pruned)
+
+    # Ensure `defaults` appears first in the top-level mapping
+    if "defaults" in pruned:
+        pruned = {"defaults": pruned["defaults"], **pruned}
+
+    # Emit
+    text = OmegaConf.to_yaml(OmegaConf.create(pruned))
+    if args.in_place:
+        Path(args.config).write_text(text)
+    else:
+        print(text + ("\n" if not text.endswith("\n") else ""), end="")
+    return 0
+
+
+def _flatten(d: Any, prefix: str = "") -> dict[str, Any]:
+    out: dict[str, Any] = {}
+    if isinstance(d, dict):
+        for k, v in d.items():
+            key = f"{prefix}.{k}" if prefix else str(k)
+            out.update(_flatten(v, key))
+    elif isinstance(d, list):
+        for i, v in enumerate(d):
+            key = f"{prefix}[{i}]"
+            out.update(_flatten(v, key))
+    else:
+        out[prefix] = d
+    return out
+
+
+def compare(args: argparse.Namespace) -> int:
+    left_path =
Path(args.left).resolve() + right_path = Path(args.right).resolve() + + # Expand via repo loader, then convert to plain dict/list so _flatten works + left = OmegaConf.to_container(load_config(str(left_path))) # type: ignore[assignment] + right = OmegaConf.to_container(load_config(str(right_path))) # type: ignore[assignment] + + lf = _flatten(left) + rf = _flatten(right) + + left_keys = set(lf.keys()) + right_keys = set(rf.keys()) + + added = sorted(right_keys - left_keys) + removed = sorted(left_keys - right_keys) + common = sorted(left_keys & right_keys) + + changed: list[str] = [] + for k in common: + if lf[k] != rf[k]: + changed.append(k) + + if not added and not removed and not changed: + print("Configs are identical after expansion") + return 0 + + # Print concise report with explicit left/right context + print("Comparing configs after expansion:") + print(f" Left : {left_path}") + print(f" Right: {right_path}") + + if added: + print("\nAdded in Right (missing in Left):") + for k in added: + print(f" {k} = {rf[k]}") + + if removed: + print("\nRemoved in Right (only in Left):") + for k in removed: + print(f" {k} = {lf[k]}") + + if changed: + print("\nChanged (Left -> Right):") + for k in changed: + print(f" {k}: {lf[k]} -> {rf[k]}") + return 0 + + +def minimize_check(args: argparse.Namespace) -> int: + """Check if minimizing would change the file. Exit non-zero if so. + + Args (same as `minimize`): + base: Base config path + config: Child config path + """ + child_path = Path(args.config).resolve() + base_path = Path(args.base).resolve() + + # Compute minimized text (same as minimize()) + child_cfg_raw = OmegaConf.load(child_path) + base_cfg_raw = OmegaConf.load(base_path) + if not isinstance(child_cfg_raw, DictConfig) or not isinstance( + base_cfg_raw, DictConfig + ): + print( + f"[minimize-check] Both child and base must be mappings: {child_path} vs {base_path}", + file=sys.stderr, + ) + return 2 + + child_resolved = OmegaConf.to_container(child_cfg_raw) + base_resolved = OmegaConf.to_container(base_cfg_raw) + if not isinstance(child_resolved, dict) or not isinstance(base_resolved, dict): + print( + f"[minimize-check] Both child and base must resolve to mappings: {child_path} vs {base_path}", + file=sys.stderr, + ) + return 2 + + pruned = _prune_equal(child_resolved, base_resolved) + if pruned is None or not isinstance(pruned, dict): + pruned = {} if pruned is None else {"value": pruned} + _ensure_defaults_relative(child_path, base_path, pruned) + if "defaults" in pruned: + pruned = {"defaults": pruned["defaults"], **pruned} + minimized_text = OmegaConf.to_yaml(OmegaConf.create(pruned)) + + # Normalize current file via OmegaConf to reduce noise from formatting differences + try: + current_norm_text = OmegaConf.to_yaml(OmegaConf.load(child_path)) + except Exception: + current_norm_text = child_path.read_text() + + if current_norm_text != minimized_text: + print( + f"[minimize-check] {child_path} is not minimized.\n" + f" Suggested fix: tools/config_cli.py minimize {base_path} {child_path} --in-place", + file=sys.stderr, + ) + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Config tools (expand, minimize)") + sub = parser.add_subparsers(dest="cmd", required=True) + + p_expand = sub.add_parser("expand", help="Resolve a config with OmegaConf") + p_expand.add_argument("config", help="Path to config YAML") + p_expand.add_argument( + "--in-place", + action="store_true", + dest="in_place", + help="Edit file in place instead of printing", 
+ ) + p_expand.set_defaults(func=expand) + + p_min = sub.add_parser( + "minimize", + help="Remove keys equal to base and ensure defaults reference base", + ) + p_min.add_argument("base", help="Base config path") + p_min.add_argument("config", help="Child config path") + p_min.add_argument( + "--in-place", + action="store_true", + dest="in_place", + help="Edit file in place instead of printing", + ) + p_min.set_defaults(func=minimize) + + p_cmp = sub.add_parser( + "compare", help="Compare two configs after expanding their defaults" + ) + p_cmp.add_argument("left", help="Left config path") + p_cmp.add_argument("right", help="Right config path") + p_cmp.set_defaults(func=compare) + + p_minchk = sub.add_parser( + "minimize-check", + help=( + "Exit non-zero if minimizing would change the file; args mirror `minimize`" + ), + ) + p_minchk.add_argument("base", help="Base config path") + p_minchk.add_argument("config", help="Child config path") + p_minchk.set_defaults(func=minimize_check) + + args = parser.parse_args() + ret = args.func(args) + if isinstance(ret, int): + sys.exit(ret) From 3b06882ac0fc5f68afe3a3dc2d5ae610212df8f4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:06:09 +0000 Subject: [PATCH 02/15] commit Signed-off-by: Terry Kong --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b55ea31e2..b09f6cceb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,6 +47,14 @@ repos: additional_dependencies: [] minimum_pre_commit_version: "2.9.2" + # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches + # what you want merge in early otherwise you risk running one experiment, but when you merge the config + # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters. + # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with + # the default one, it gets our recommended chat_template which is not what comes from the config. + # + # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config + # is minimized before accepting the recipe upstream. 
   - repo: local
     hooks:
       - id: configs-minimize-check-llm

From 18b0e49c502cf3bd92bd77fb5b9cfa70b4c45b84 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 24 Sep 2025 17:17:12 +0000
Subject: [PATCH 03/15] fix docstring to reflect it is a script

Signed-off-by: Terry Kong
---
 tools/config_cli.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/config_cli.py b/tools/config_cli.py
index 9583a55011..14010d8d43 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -19,18 +19,18 @@
 
 Example:
     # Expand a config with a root level "defaults" key to see the full config; print to stdout
-    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+    tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
 
     # Expand a config with a root level "defaults" key to see the full config; edit the config in place
-    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+    tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
 
     # Minimize a config and remove all keys that are present in the base config; print to stdout
-    # uv run tools/config_cli.py minimize
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+    # tools/config_cli.py minimize
+    tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
 
     # Minimize a config and remove all keys that are present in the base config; edit the config in place
-    # uv run tools/config_cli.py minimize
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+    # tools/config_cli.py minimize
+    tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
 
     # Minimize all the llm configs:
     for algo in grpo dpo sft; do
@@ -39,21 +39,21 @@
         base_config=examples/configs/grpo_math_1B.yaml
       fi
       for recipe in examples/configs/recipes/llm/${algo}-*.yaml; do
-        uv run tools/config_cli.py minimize $base_config $recipe --in-place
+        tools/config_cli.py minimize $base_config $recipe --in-place
       done
     done
 
     # Minimize vlm configs:
     for recipe in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do
-      uv run tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
+      tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
     done
 
     # Compare two configs
-    uv run tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
+    tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
 
     # Minimize a config and compare it to not minimizing (should be the same)
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
>examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized + tools/config_cli.py compare \ examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml \ examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized """ From 029c68f86a3034d9332f3c1c23e6db3536f31fa8 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:17:21 +0000 Subject: [PATCH 04/15] minimize configs Signed-off-by: Terry Kong --- ....1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml | 79 ++------ ...po-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml | 83 ++------ ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 102 +--------- ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 100 +--------- .../dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml | 48 ++--- ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml | 83 +------- ...truct-2407-1n8g-fsdp2tp8-actckpt-long.yaml | 98 ++-------- .../recipes/llm/grpo-deepscaler-1.5b-16K.yaml | 15 +- .../recipes/llm/grpo-deepscaler-1.5b-24K.yaml | 37 +--- .../recipes/llm/grpo-deepscaler-1.5b-8K.yaml | 134 +------------ .../llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 111 +---------- ...ma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml | 101 +--------- .../llm/grpo-gspo-deepscaler-1.5b-8K.yaml | 128 +----------- ...lama3.1-8b-instruct-1n8g-megatron-fp8.yaml | 141 ++------------ ...b-instruct-2n8g-fsdp2tp1-noncolocated.yaml | 110 ++--------- ...3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml | 115 ++--------- ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml | 112 +---------- ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 135 +------------ ...po-math-qwen3-30ba3b-megatron-tp4-32k.yaml | 160 ++------------- .../grpo-moonlight-16ba3b-4n8g-megatron.yaml | 156 ++------------- ...-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml | 110 ++--------- ...en2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml | 110 ++--------- ...wen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml | 111 ++--------- ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 161 ++-------------- ...5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml | 112 +---------- .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 129 ++----------- ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 110 ++--------- ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 48 +---- .../sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 56 +----- .../llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 54 +----- ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 91 +-------- .../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 93 +-------- .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml | 64 +----- ...wen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml | 57 +----- ...3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml | 182 +----------------- ...2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml | 179 +---------------- 36 files changed, 317 insertions(+), 3398 deletions(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml index 72dcb9ad1e..18a84b9cee 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml @@ -1,95 +1,44 @@ +defaults: ../../dpo.yaml dpo: max_num_epochs: 2 max_num_steps: 20 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 
sft_loss_weight: 0.01 - checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 save_period: 10000 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 2048 - precision: "bfloat16" dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 0.98] - eps: 1e-8 - foreach: False - fused: False - + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.000000001 - end_factor: 1.0 - total_iters: 1 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [1] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-09 + end_factor: 1.0 + total_iters: 1 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: + - 1 logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 22851b368c..f18407fd59 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -1,95 +1,40 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - -checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 8192 - precision: "bfloat16" dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 4 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 
0.98] - eps: 1e-8 - foreach: False - fused: False - + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.000000001 - end_factor: 1.0 - total_iters: 1 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [1] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-09 + end_factor: 1.0 + total_iters: 1 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: + - 1 logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index f5f0b2e5d7..72ac01081d 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -1,128 +1,32 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - checkpointing: - enabled: false #true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - + enabled: false policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 8192 - precision: "bfloat16" dtensor_cfg: enabled: false - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1.0 - optimizer: null - megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 4 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - 
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 9dd723ec22..78c3e80336 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -1,128 +1,34 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 max_num_steps: 20 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - checkpointing: - enabled: false #true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 + enabled: false save_period: 10000 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 2048 - precision: "bfloat16" dtensor_cfg: enabled: false - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1.0 - optimizer: null - megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: 
"constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml index c3398a6cd5..3527838c62 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml @@ -1,51 +1,43 @@ -defaults: "../../dpo.yaml" - +defaults: ../../dpo.yaml cluster: - num_nodes: 1 gpus_per_node: 8 - policy: - model_name: "allenai/Llama-3.1-Tulu-3-8B-SFT" + model_name: allenai/Llama-3.1-Tulu-3-8B-SFT tokenizer: - name: "allenai/Llama-3.1-Tulu-3-8B-SFT" + name: allenai/Llama-3.1-Tulu-3-8B-SFT train_micro_batch_size: 1 - train_global_batch_size: 128 max_total_sequence_length: 2048 optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-7 + lr: 5.0e-07 weight_decay: 0.0 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0e-6 - end_factor: 1.0 - total_iters: 211 - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0 - end_factor: 0.0 - total_iters: 1899 - - milestones: [211] - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-06 + end_factor: 1.0 + total_iters: 211 + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0 + end_factor: 0.0 + total_iters: 1899 + - milestones: + - 211 data: - dataset_name: "Tulu3Preference" - + dataset_name: Tulu3Preference dpo: max_num_steps: 2110 val_period: -1 val_at_start: false - preference_average_log_probs: True + preference_average_log_probs: true reference_policy_kl_penalty: 5 val_micro_batch_size: ${policy.train_micro_batch_size} val_global_batch_size: ${policy.train_global_batch_size} - checkpointing: metric_name: null save_period: 250 - logger: - wandb_enabled: True + wandb_enabled: true wandb: - name: "dpo-tulu3-8b" + name: dpo-tulu3-8b diff --git a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml index 22870f0e66..252251fd76 100644 --- a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml @@ -1,96 +1,15 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 - val_period: 25 - val_batches: 8 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 - sft_loss_weight: 0 - -checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: 
"meta-llama/Llama-3.2-1B-Instruct" tokenizer: name: ${policy.model_name} - - train_global_batch_size: 128 - train_micro_batch_size: 2 - max_total_sequence_length: 1024 - precision: "bfloat16" - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 0.98] - eps: 1e-5 - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [20] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml index 86a3a6fc97..9833aa30d0 100644 --- a/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml @@ -1,108 +1,46 @@ -# DPO Algorithm Configuration +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 max_num_steps: 100 val_period: 10 val_batches: 1 val_global_batch_size: 16 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 - reference_policy_kl_penalty: 0.1 - preference_average_log_probs: False # whether normalizing log probs according to the sequence length in preference_loss - sft_average_log_probs: ${.preference_average_log_probs} # whether normalizing log probs according to the sequence length in sft_loss - - preference_loss_weight: 1 # the coefficient of the preference loss - sft_loss_weight: 0 # the coefficient of the SFT loss - checkpointing: - enabled: true - checkpoint_dir: "results/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" - metric_name: "val_loss" - higher_is_better: false + checkpoint_dir: results/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long keep_top_k: null - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: "mistralai/Mistral-Nemo-Instruct-2407" + model_name: mistralai/Mistral-Nemo-Instruct-2407 tokenizer: name: ${policy.model_name} - - # number of preference samples per batch - # each preference sample corresponds to a pair of chosen and rejected responses - # so the actual batch size processed by the model is train_global_batch_size * 2 train_global_batch_size: 8 train_micro_batch_size: 1 - - - #logprob_batch_size: ${policy.train_micro_batch_size} max_total_sequence_length: 12288 - precision: "bfloat16" - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false activation_checkpointing: true tensor_parallel_size: 8 - context_parallel_size: 1 - custom_parallel_plan: null clear_cache_every_n_steps: 1 
env_vars: - PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:64" - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - + PYTORCH_CUDA_ALLOC_CONF: max_split_size_mb:64 optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 1.0e-6 + lr: 1.0e-06 weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - + betas: + - 0.9 + - 0.999 + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [] - + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: [] data: - dataset_name: "HelpSteer3" - shuffle: False - max_input_seq_length: ${policy.max_total_sequence_length} - + shuffle: false logger: - log_dir: "logs/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" # Base directory for all logs - wandb_enabled: false # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: false - mlflow_enabled: false - monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + log_dir: logs/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long wandb: - project: "nemo-rl" - name: "dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - + project: nemo-rl + name: dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml index 570fecb1b9..8fc6eccbdd 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml @@ -1,23 +1,14 @@ -# GRPO Algorithm Configuration -defaults: "grpo-deepscaler-1.5b-8K.yaml" - +defaults: +- ../../grpo_math_1B.yaml +- grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.001 ratio_clip_max: 0.28 - - policy: max_total_sequence_length: 16384 logprob_batch_size: 2 - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml index 3cd8fabd6d..2bf34c47d1 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml @@ -1,48 +1,23 @@ -# GRPO Algorithm Configuration -defaults: "grpo-deepscaler-1.5b-8K.yaml" - +defaults: +- ../../grpo_math_1B.yaml +- grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.0001 - ratio_clip_min: 0.2 ratio_clip_max: 0.28 - policy: max_total_sequence_length: 24576 logprob_batch_size: 2 - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true 
tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False - sequence_packing: - enabled: False - + enabled: false optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-7 - + lr: 5.0e-07 generation: - backend: "vllm" - max_new_tokens: ${policy.max_total_sequence_length} - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null vllm_cfg: - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 gpu_memory_utilization: 0.8 - enforce_eager: True - max_model_len: ${policy.max_total_sequence_length} + enforce_eager: true diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml index f6cc626890..48d3317e81 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml @@ -1,157 +1,37 @@ -# GRPO Algorithm Configuration +defaults: ../../grpo_math_1B.yaml grpo: num_prompts_per_step: 128 num_generations_per_prompt: 8 - max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false max_val_samples: 480 val_batch_size: 32 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - loss_fn: reference_policy_kl_penalty: 0.0 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - checkpointing: - enabled: true - checkpoint_dir: "results/grpo" - metric_name: "val_reward" - higher_is_better: true keep_top_k: 10 - save_period: 10 - checkpoint_must_save_by: null - policy: - model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B train_global_batch_size: 64 train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 max_total_sequence_length: 8192 - precision: "bfloat16" - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False - sequence_packing: - enabled: False - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - + enabled: false optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 2.0e-6 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - + lr: 2.0e-06 generation: - backend: "vllm" - max_new_tokens: ${policy.max_total_sequence_length} - temperature: 1.0 - top_p: 1.0 - 
top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False vllm_kwargs: compilation_config: - # when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy, - # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile - # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998 - use_inductor: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - + use_inductor: false data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/cot.txt" - system_prompt_file: null - dataset_name: "DeepScaler" - shuffle: true - + dataset_name: DeepScaler env: math: num_workers: 16 - logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: false - mlflow_enabled: false - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - mlflow: - experiment_name: "grpo-dev" - run_name: "grpo-dev-logger" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - + monitor_gpus: false cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 091cb2909a..15ca65c8f9 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -1,136 +1,29 @@ +defaults: ../../grpo_math_1B.yaml grpo: - num_prompts_per_step: 32 - num_generations_per_prompt: 16 - max_rollout_turns: 1 - max_num_epochs: 1 max_num_steps: 500 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true checkpointing: - enabled: true checkpoint_dir: results/grpo-gemma3-1b-it-1n8g-fsdp2tp1 - metric_name: val_reward - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null policy: model_name: google/gemma-3-1b-it tokenizer: name: google/gemma-3-1b-it - train_global_batch_size: 512 - train_micro_batch_size: 4 - generation_batch_size: 32 - logprob_batch_size: 4 - max_total_sequence_length: 512 - precision: bfloat16 - dtensor_cfg: - 
enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 + enabled: true sequence_packing: enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.01 - betas: - - 0.9 - - 0.999 - eps: 1e-08 - foreach: false - fused: false - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1 - total_iters: 50 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1 - total_iters: 10000000000 - - milestones: - - 50 generation: - backend: vllm max_new_tokens: 512 - temperature: 1 - top_p: 1 - top_k: null - stop_token_ids: null - stop_strings: null vllm_cfg: - async_engine: false - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 max_model_len: 512 - enforce_eager: False - colocated: - enabled: true - resources: - gpus_per_node: null - num_nodes: null data: max_input_seq_length: 512 - prompt_file: examples/prompts/cot.txt - system_prompt_file: null - dataset_name: OpenMathInstruct-2 - shuffle: true -env: - math: - num_workers: 8 logger: log_dir: logs/grpo-gemma3-1b-it-1n8g-fsdp2tp1 - num_val_samples_to_print: 0 wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: grpo-gemma3-1b-it-1n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml index 4c3351970c..c50ea4834b 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml @@ -1,137 +1,40 @@ +defaults: ../../grpo_math_1B.yaml grpo: num_prompts_per_step: 64 num_generations_per_prompt: 32 - max_rollout_turns: 1 - max_num_epochs: 1 max_num_steps: 20 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true checkpointing: - enabled: true checkpoint_dir: results/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long - metric_name: val_reward - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null policy: model_name: google/gemma-3-27b-it tokenizer: name: google/gemma-3-27b-it - train_global_batch_size: 512 train_micro_batch_size: 1 - 
diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
index 4c3351970c..c50ea4834b 100644
--- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
+++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
@@ -1,137 +1,40 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 20
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: google/gemma-3-27b-it
   tokenizer:
     name: google/gemma-3-27b-it
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
-  dynamic_batching:
-    # TODO: OOMs if enabled https://github.com/NVIDIA-NeMo/RL/issues/383
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
+      lr: 3.0e-07
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 8
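As in several of the larger recipes below, the training-side tensor parallelism and the vLLM engine's tensor parallelism are configured independently: here training shards 8-way while generation re-shards the same weights 4-way. The two knobs live in different subtrees:

    policy:
      dtensor_cfg:
        tensor_parallel_size: 8    # training (FSDP2/DTensor) sharding
      generation:
        vllm_cfg:
          tensor_parallel_size: 4  # rollout engine sharding, set separately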
diff --git a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
index e1b7c4d809..547b4c4382 100644
--- a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
+++ b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
@@ -1,152 +1,38 @@
-# GRPO Algorithm Configuration
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 128
   num_generations_per_prompt: 8
-  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
-  max_num_epochs: 1
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
   max_val_samples: 480
   val_batch_size: 32
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
   reference_policy_kl_penalty: 0.0
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
   sequence_level_importance_ratios: true
   token_level_loss: false
-
 checkpointing:
-  enabled: true
-  checkpoint_dir: "results/grpo"
-  metric_name: "val_reward"
-  higher_is_better: true
   keep_top_k: 10
-  save_period: 10
-  checkpoint_must_save_by: null
-
 policy:
-  model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+  model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
   train_global_batch_size: 64
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 4
   max_total_sequence_length: 8192
-  precision: "bfloat16"
-
   dtensor_cfg:
-    enabled: true
     cpu_offload: true
     sequence_parallel: true
     activation_checkpointing: true
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
-
-  dynamic_batching:
-    enabled: False
-
   sequence_packing:
-    enabled: False
-
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
-  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 1.0
-
+    enabled: false
   optimizer:
-    name: "torch.optim.AdamW"
     kwargs:
-      lr: 2.0e-6
-      weight_decay: 0.01
-      betas: [0.9, 0.999]
-      eps: 1e-8
-      # when using Dtensor, we need to set foreach
-      # and fused to False
-      foreach: False
-      fused: False
-
-  scheduler:
-    - name: "torch.optim.lr_scheduler.LinearLR"
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 50
-    - name: "torch.optim.lr_scheduler.ConstantLR"
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones: [50]
-
+      lr: 2.0e-06
   generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      enforce_eager: True
-    colocated:
-      # true: generation shares training GPUs
-      # false: uses dedicated generation resources
-      enabled: true
-      # only relevant when enabled is false
-      resources:
-        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
-        num_nodes: null # Decides number of nodes to be dedicated to generation
-
+      enforce_eager: true
 data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "DeepScaler"
-  shuffle: true
-
+  dataset_name: DeepScaler
 env:
   math:
     num_workers: 16
-
 logger:
-  log_dir: "logs" # Base directory for all logs
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
-  tensorboard_enabled: false
-  mlflow_enabled: false
-  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
-  wandb:
-    project: "grpo-dev"
-    name: "grpo-dev-logger"
-  tensorboard: {}
-  mlflow:
-    experiment_name: "grpo-dev"
-    run_name: "grpo-dev-logger"
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+  monitor_gpus: false
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
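The GSPO recipe's substantive deltas are concentrated in `loss_fn`: importance ratios are computed per sequence rather than per token, the token-level loss is turned off, and the reference-policy KL term is zeroed. Collected in one fragment for readability:

    loss_fn:
      reference_policy_kl_penalty: 0.0        # no reference-policy KL penalty
      sequence_level_importance_ratios: true  # one importance ratio per sequence
      token_level_loss: false                 # sequence-level objective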
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
index 81ca15f6bd..a61133c358 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
@@ -1,169 +1,52 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: True
-  token_level_loss: true
+  use_importance_sampling_correction: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  dtensor_cfg:
-    enabled: False
-
-  dynamic_batching:
-    enabled: False
-
   sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-
+    enabled: false
   megatron_cfg:
-    enabled: True
+    enabled: true
     empty_unused_memory_level: 1
-    converter_type: "LlamaForCausalLM"
-    tensor_model_parallel_size: 1
+    converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    sequence_parallel: False
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    freeze_moe_router: True
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    apply_rope_fusion: True
-    activation_checkpointing: True
-    defer_fp32_logits: True
-
+    activation_checkpointing: true
+    defer_fp32_logits: true
     optimizer:
-      optimizer: "adam"
-      lr: 5.0e-7
-      min_lr: 5.0e-8
+      lr: 5.0e-07
+      min_lr: 5.0e-08
       weight_decay: 0.0
-      bf16: True
-      fp16: False
-      params_dtype: "float32"
-
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      use_distributed_optimizer: True
-      use_precision_aware_optimizer: True
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 2
-      lr_warmup_init: 5.0e-8
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: False
-      overlap_grad_reduce: True
-      overlap_param_gather: True
-      average_in_collective: True
-      use_custom_fsdp: False
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      lr_warmup_init: 5.0e-08
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: 'fp8'
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
+      precision: fp8
       max_model_len: 4096
-      enforce_eager: False
       use_deep_gemm: true
-      num_last_layers_in_bf16: 0
-      num_first_layers_in_bf16: 0
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
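In the fp8 recipe, reduced precision is confined to the vLLM rollout engine; the training path keeps the base precision. The recipe also enables the importance-sampling correction, presumably paired with fp8 rollouts to compensate for the logprob mismatch between the generation engine and the training policy:

    loss_fn:
      use_importance_sampling_correction: true
    policy:
      generation:
        vllm_cfg:
          precision: fp8       # generation only; training precision is inherited
          use_deep_gemm: true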
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
index 17b474bd72..052d082328 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
       async_engine: true
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
     colocated:
       enabled: false
       resources:
-        gpus_per_node: null
        num_nodes: 1
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 2
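This is the one recipe in the set that disables colocated generation, so vLLM gets a dedicated node instead of sharing the training GPUs; per the base config's comments, `colocated.resources` is only consulted in that mode. The relevant fragment:

    policy:
      generation:
        vllm_cfg:
          async_engine: true  # async engine, used here with non-colocated serving
        colocated:
          enabled: false      # generation runs on its own resources
          resources:
            num_nodes: 1      # 1 of the cluster's 2 nodes reserved for generation
    cluster:
      num_nodes: 2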
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
index 1c2b3840ca..df9181f660 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
@@ -1,137 +1,52 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
index eddf09bf97..fce039a321 100644
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -1,137 +1,31 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  optimizer:
-    name: torch.optim.AdamW
-    kwargs:
-      lr: 5e-06
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
index 4ad29901fa..48f00c626e 100755
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
@@ -1,167 +1,36 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
   optimizer: null
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
   dtensor_cfg:
     enabled: false
-  dynamic_batching:
-    enabled: False
-  sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.2-1b-instruct-1n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
index 507b1eefd8..e2c2582194 100644
--- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
+++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
@@ -1,175 +1,53 @@
+defaults: ../../grpo_math_1B.yaml
 checkpointing:
-  enabled: True
   checkpoint_dir: results/grpo-math-qwen3-30ba3b-megatron-tp4-32k
   save_period: 3
   keep_top_k: 1
-  metric_name: val_reward
-  higher_is_better: True
-  checkpoint_must_save_by: null
-
 grpo:
-  normalize_rewards: True
-  use_leave_one_out_baseline: True
-  max_num_epochs: 1
   max_num_steps: 3
   num_prompts_per_step: 64
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
   val_period: 3
-  val_at_start: False
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: False
-  use_importance_sampling_correction: False
-  token_level_loss: True
-  ratio_clip_c: null
-
 policy:
-  model_name: "Qwen/Qwen3-30B-A3B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
   logprob_batch_size: 1
   max_total_sequence_length: 32768
-  precision: "bfloat16"
   logprob_chunk_size: 2048
-
   dtensor_cfg:
-    enabled: False
-
-  dynamic_batching:
-    enabled: False
-
+    enabled: false
   sequence_packing:
-    enabled: False
-
-  max_grad_norm: 1.0
+    enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
-
-  optimizer: null # remove default FSDP optimizer
-
-  scheduler: null # remove default FSDP scheduler
-
+  optimizer: null
+  scheduler: null
   megatron_cfg:
-    enabled: True
+    enabled: true
     empty_unused_memory_level: 1
-    converter_type: "LlamaForCausalLM"
+    converter_type: LlamaForCausalLM
     tensor_model_parallel_size: 4
-    pipeline_model_parallel_size: 1
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 8
-    sequence_parallel: True
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    freeze_moe_router: True
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    apply_rope_fusion: True
-    activation_checkpointing: True
-    defer_fp32_logits: True
-
+    sequence_parallel: true
+    activation_checkpointing: true
+    defer_fp32_logits: true
    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-7
-      min_lr: 5.0e-8
+      lr: 5.0e-07
+      min_lr: 5.0e-08
       weight_decay: 0.0
-      bf16: True
-      fp16: False
-      params_dtype: "float32"
-
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      use_distributed_optimizer: True
-      use_precision_aware_optimizer: True
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 2
-      lr_warmup_init: 5.0e-8
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: False
-      overlap_grad_reduce: True
-      overlap_param_gather: True
-      average_in_collective: True
-      use_custom_fsdp: False
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      lr_warmup_init: 5.0e-08
  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: False
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      # NB(pjin): https://github.com/NVIDIA-NeMo/RL/pull/857
-      enforce_eager: True
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
-
-data:
-  dataset_name: "OpenMathInstruct-2"
-  shuffle: true
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-
-env:
-  math:
-    num_workers: 8
-
+      enforce_eager: true
 logger:
   log_dir: logs/grpo-math-qwen3-30ba3b-megatron-tp4-32k
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: True
-  tensorboard_enabled: True
-  mlflow_enabled: False # Disable MLflow logging
-  monitor_gpus: False # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
   wandb:
     project: nemo-rl
-    name: "grpo-math-qwen3-30ba3b-megatron-tp4-32k"
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+  name: grpo-math-qwen3-30ba3b-megatron-tp4-32k
 cluster:
   gpus_per_node: 8
   num_nodes: 4
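The Qwen3-30B-A3B MoE recipes shard along two axes at once: dense/attention weights are split by tensor parallelism, while expert weights are additionally split by expert parallelism (8-way in the 32k recipe above, 4-way in the 8n8g recipe further below). The corresponding knobs, as they appear in these configs:

    policy:
      megatron_cfg:
        tensor_model_parallel_size: 4  # dense/attention sharding
        expert_model_parallel_size: 8  # expert sharding (MoE layers only)
        sequence_parallel: true        # enabled alongside TP in these recipes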
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
index a0784ba746..e1e38fbbfc 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
@@ -1,174 +1,40 @@
-# GRPO Algorithm Configuration
-defaults: "../../grpo_math_1B.yaml"
-
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_num_epochs: 1
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
   val_period: -1
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
   reference_policy_kl_penalty: 0.04
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
-  ratio_clip_c: null
-
 checkpointing:
   enabled: false
-  checkpoint_dir: "results/grpo_megatron"
-  metric_name: "val_reward"
-  higher_is_better: true
-  keep_top_k: 3
+  checkpoint_dir: results/grpo_megatron
   save_period: 10000
-
 policy:
-  model_name: "moonshotai/Moonlight-16B-A3B-Instruct"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: moonshotai/Moonlight-16B-A3B-Instruct
   train_micro_batch_size: 1
-  generation_batch_size: 64 # Only used when generating using megatron backend
+  generation_batch_size: 64
   logprob_batch_size: 1
   max_total_sequence_length: 8192
-  precision: "bfloat16"
-
   dtensor_cfg:
     enabled: false
-
-  # dynamic_batching improves performance by ensuring logprob and training microbatches
-  # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length
-  # responses are sorted by sequence length and bucketed into microbatches with a total
-  # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the
-  # training and logprob stages respectively.
-  dynamic_batching:
-    enabled: False
-
  sequence_packing:
-    enabled: False # coming soon
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_ffd"
-    sequence_length_round: 64
-
-  max_grad_norm: 1.0
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
+    enabled: false
+    algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
-
-  optimizer: null # remove default FSDP optimizer
-
+  optimizer: null
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 4
     pipeline_model_parallel_size: 4
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    # Causes logprob error divergence for moonlight
-    apply_rope_fusion: False
-
+    apply_rope_fusion: false
    optimizer:
-      optimizer: "adam"
-      lr: 1.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
+      lr: 1.0e-06
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    vllm_cfg:
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "OpenMathInstruct-2"
-
-env:
-  math:
-    num_workers: 8
-
 logger:
-  log_dir: "logs" # Base directory for all logs
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
-  tensorboard_enabled: false
-  mlflow_enabled: False
-  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  monitor_gpus: false
   wandb:
-    project: "grpo-dev"
-    name: "grpo-moonlight-16B-A3B-Instruct"
-    tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+    name: grpo-moonlight-16B-A3B-Instruct
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
index 7fd4007279..b5aaf22ceb 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 20
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-32B
   tokenizer:
     name: Qwen/Qwen2.5-32B
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151643
-    stop_strings: null
+      - 151643
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 32
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
index f163092404..44c2f7f8eb 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 2
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-32B
   tokenizer:
     name: Qwen/Qwen2.5-32B
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151643
-    stop_strings: null
+      - 151643
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 32
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
index f6ecc1e390..98e7eadedd 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
@@ -1,137 +1,56 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 30
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-7B-Instruct
   tokenizer:
     name: Qwen/Qwen2.5-7B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
-    activation_checkpointing: false
     tensor_parallel_size: 4
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 4
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
index 1209040cda..a42ea746a7 100755
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
@@ -1,189 +1,56 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 30
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-7B-Instruct
-  tokenizer:
-    name: ${policy.model_name}
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   dtensor_cfg:
     enabled: false
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: 2
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-  dynamic_batching:
-    enabled: false
-  sequence_packing:
-    enabled: true
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 4
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-7b-instruct-4n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
index b8f79eb6ae..c417c00dbd 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -1,137 +1,31 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 450
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-Math-1.5B-Instruct
   tokenizer:
     name: Qwen/Qwen2.5-Math-1.5B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  optimizer:
-    name: torch.optim.AdamW
-    kwargs:
-      lr: 5e-06
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
index 5c7d1ed78f..de30fe287a 100755
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
@@ -1,155 +1,48 @@
-# GRPO Algorithm Configuration
-defaults: "../../grpo_math_1B.yaml"
-
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
-  ratio_clip_c: null
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-qwen3-30ba3b-8n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
-  model_name: "Qwen/Qwen3-30B-A3B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 4
   max_total_sequence_length: 4096
-  precision: "bfloat16"
-
   dtensor_cfg:
     enabled: false
-
-  optimizer: null # remove default FSDP optimizer
-
-  scheduler: null # remove default FSDP scheduler
-
-  dynamic_batching:
-    enabled: False
+  optimizer: null
+  scheduler: null
   sequence_packing:
-    enabled: False # coming soon
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_ffd"
-    sequence_length_round: 64
-  max_grad_norm: 1.0
+    enabled: false
+    algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
-    activation_checkpointing: false
     tensor_model_parallel_size: 4
     pipeline_model_parallel_size: 4
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 4
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: True
-    pipeline_dtype: ${policy.precision}
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
+    sequence_parallel: true
    optimizer:
-      optimizer: "adam"
-      lr: 3.0e-7
-      min_lr: 3.0e-8
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-      clip_grad: ${policy.max_grad_norm}
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-      #sgd
-      sgd_momentum: 0.9
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
+      lr: 3.0e-07
+      min_lr: 3.0e-08
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 3.0e-8
-
+      lr_warmup_init: 3.0e-08
     env_vars:
-      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
   generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.7
-      max_model_len: ${policy.max_total_sequence_length}
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "OpenMathInstruct-2"
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen3-30ba3b-8n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen3-30ba3b-8n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 8
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
index 33435fbd15..5ffc78b136 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
@@ -1,134 +1,60 @@
+defaults: ../../sft.yaml
 sft:
-  max_num_epochs: 1
   max_num_steps: 1000000
   val_period: 500
   val_batches: 4
   val_global_batch_size: 128
-  val_micro_batch_size: 1
   val_at_start: false
-  seed: 42
 checkpointing:
-  enabled: true
   checkpoint_dir: results/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron
-  metric_name: val_loss
-  higher_is_better: false
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
-  model_name: "meta-llama/Llama-3.1-70B"
+  model_name: meta-llama/Llama-3.1-70B
   tokenizer:
-    name: meta-llama/Llama-3.1-8B-Instruct ## specify if you'd like to use a tokenizer different from the model's default
+    name: meta-llama/Llama-3.1-8B-Instruct
   train_global_batch_size: 512
-  train_micro_batch_size: 1
   max_total_sequence_length: 4096
-  precision: "bfloat16"
   dtensor_cfg:
     enabled: false
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
     tensor_model_parallel_size: 4
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
     pipeline_model_parallel_size: 2
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
- sequence_parallel: false freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + moe_router_bias_update_rate: 0.0 optimizer: - optimizer: "adam" - lr: 2e-5 - min_lr: 2e-5 + lr: 2.0e-05 + min_lr: 2.0e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - + adam_eps: 1.0e-08 clip_grad: 0.0 - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 lr_warmup_iters: 1 - lr_warmup_init: 2e-5 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training + lr_warmup_init: 2.0e-05 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} max_grad_norm: null optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: [0.9, 0.98] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: - log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: true - mlflow_enabled: False - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + monitor_gpus: false wandb: - project: "sft-dev" - name: "openmathinstruct-nemorl-1M_train" + name: openmathinstruct-nemorl-1M_train tensorboard: - log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + log_dir: tb_logs-openmathinstruct-nemorl-1M_train cluster: gpus_per_node: 8 num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index d7906b82e0..1a7e4e1994 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -1,20 +1,14 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 10000 val_period: 500 
val_batches: 4 val_global_batch_size: 128 val_micro_batch_size: 2 val_at_start: false - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 50 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,59 +16,29 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 4 - context_parallel_size: 1 - custom_parallel_plan: null dynamic_batching: enabled: true - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - sequence_length_round: 64 - sequence_packing: - enabled: false make_sequence_length_divisible_by: 1 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' + output_key: generated_solution seed: 42 - shuffle: true logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long tensorboard: log_dir: tb_logs-sft-dev-squad - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 1fc0ccec7c..dc4a671fec 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -1,20 +1,14 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 10000 val_period: 500 val_batches: 4 val_global_batch_size: 128 val_micro_batch_size: 2 val_at_start: false - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,63 +16,25 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - sequence_length_round: 64 - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - 
eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long tensorboard: log_dir: tb_logs-sft-dev-squad - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 - diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index 8c3f14b531..f4c0296977 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -1,20 +1,10 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 350 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 20 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,60 +12,28 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false sequence_parallel: true - activation_checkpointing: false tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 2 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 0bb610fff3..43b351cd34 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -1,20 +1,10 @@ +defaults: 
../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 250 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 50 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,105 +12,36 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: enabled: false - dynamic_batching: - enabled: false sequence_packing: enabled: true - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1 optimizer: null megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - optimizer: - optimizer: "adam" - lr: 2.0e-5 - min_lr: 1.99999e-5 + lr: 2.0e-05 + min_lr: 1.99999e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 lr_warmup_init: 1.9999e-65 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - - data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' + output_key: generated_solution seed: 42 - shuffle: true logger: log_dir: logs/sft-llama3.1-8b-1n8g-megatron - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 648f45ab12..e68a1e9792 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ 
b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -1,20 +1,10 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 250 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,105 +12,34 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: enabled: false - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1 optimizer: null megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - optimizer: - optimizer: "adam" - lr: 2.0e-5 - min_lr: 1.99999e-5 + lr: 2.0e-05 + min_lr: 1.99999e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 lr_warmup_init: 1.9999e-65 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - - data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-1n8g-megatron - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index 165e2fa9a3..77ff8aac89 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -1,82 +1,26 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 500 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1 - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.2-1B tokenizer: name: meta-llama/Llama-3.2-1B - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 - precision: bfloat16 - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.1 - betas: - - 0.9 - - 0.98 - eps: 1e-05 - foreach: false - fused: false data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 800d94711e..c94683c61f 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -1,81 +1,32 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 20 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: Qwen/Qwen2.5-32B tokenizer: name: Qwen/Qwen2.5-32B - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + 
message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 max_total_sequence_length: 16000 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 8 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 8 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.1 - betas: - - 0.9 - - 0.98 - eps: 1e-05 - foreach: false - fused: false data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution logger: log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml index 3f744e1a30..2d39d9cd7f 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml @@ -1,180 +1,14 @@ -# GRPO Algorithm Configuration -grpo: - num_prompts_per_step: 8 - num_generations_per_prompt: 16 - max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - overlong_filtering: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - +defaults: ../../vlm_grpo_3B.yaml checkpointing: - enabled: true - checkpoint_dir: "results/clevr_grpo" - metric_name: "val_reward" - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - + checkpoint_dir: results/clevr_grpo policy: - model_name: "Qwen/Qwen2.5-VL-3B-Instruct" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - train_global_batch_size: 128 - train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 max_total_sequence_length: 3072 - precision: "bfloat16" - - dtensor_cfg: - _v2: true - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - sequence_packing: - enabled: False - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-7 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - - generation: - backend: "vllm" - # max_new_tokens: ${policy.max_total_sequence_length} - max_new_tokens: 1024 - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
- precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - -data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/clevr_cogent_cot.txt" - system_prompt_file: null - dataset_name: "clevr-cogent" - split: "trainA" - shuffle: true - env: - clevr-cogent: - num_workers: 8 - reward_functions: - - name: format - weight: 0.2 - - name: exact_alnum - weight: 0.8 - geometry3k: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: math_expr - weight: 0.9 refcoco: - num_workers: 8 reward_functions: - - name: format - weight: 0.1 - - name: bbox_giou - weight: 0.9 - kwargs: - giou_penalty_thres: 1.0 # (apply giou penalty if iou < giou_penalty_thres; anything less than 0 means use iou only (since the condition iou < 0 is not possible)) - -logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: true - mlflow_enabled: false # Disable MLflow logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 2 - num_nodes: 1 + - name: format + weight: 0.1 + - name: bbox_giou + weight: 0.9 + kwargs: + giou_penalty_thres: 1.0 diff --git a/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml index 66feabee46..15ef079582 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml @@ -1,179 +1,6 @@ -# GRPO Algorithm Configuration -grpo: - num_prompts_per_step: 8 - num_generations_per_prompt: 16 - max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - overlong_filtering: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - +defaults: ../../vlm_grpo_3B.yaml checkpointing: - enabled: true - checkpoint_dir: "results/clevr_grpo" - metric_name: "val_reward" - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - + checkpoint_dir: results/clevr_grpo policy: - model_name: "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - train_global_batch_size: 128 - train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 + model_name: HuggingFaceTB/SmolVLM2-2.2B-Instruct max_total_sequence_length: 3072 - precision: "bfloat16" - - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - sequence_packing: - enabled: False - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-7 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - - generation: - backend: "vllm" - # max_new_tokens: ${policy.max_total_sequence_length} - max_new_tokens: 1024 - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
- precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - -data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/clevr_cogent_cot.txt" - system_prompt_file: null - dataset_name: "clevr-cogent" - split: "trainA" - shuffle: true - -env: - clevr-cogent: - num_workers: 8 - reward_functions: - - name: format - weight: 0.2 - - name: exact_alnum - weight: 0.8 - geometry3k: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: math_expr - weight: 0.9 - refcoco: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: bbox_giou - weight: 0.9 - kwargs: - giou_penalty_thres: 0.5 - -logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: true - mlflow_enabled: false # Disable MLflow logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 2 - num_nodes: 1 From 6d52f03faa474f7072bd1c65d661882912b69033 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:21:04 +0000 Subject: [PATCH 05/15] sft default chat template Signed-off-by: Terry Kong --- .../recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 1 + .../recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 1 + .../configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 1 + .../configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 1 + .../recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 1 + examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index 5ffc78b136..37e3bff33c 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -12,6 +12,7 @@ policy: model_name: meta-llama/Llama-3.1-70B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 max_total_sequence_length: 4096 dtensor_cfg: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index 1a7e4e1994..88d446283d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -13,6 +13,7 @@ policy: 
model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index dc4a671fec..86db9da5e0 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -13,6 +13,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index f4c0296977..d78e0d421a 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 43b351cd34..5deed14cb4 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index e68a1e9792..daf5cd5393 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 From 79d221b9c6bd017abd81aa8c04de9bf305ab8a24 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:07:52 +0000 Subject: [PATCH 06/15] v2 select configs Signed-off-by: Terry Kong --- examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml | 5 +++-- examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml | 5 +++-- examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml | 1 + .../llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml | 1 + .../configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml | 1 + 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml index 8fc6eccbdd..ff37a2db01 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml @@ -1,6 +1,6 @@ defaults: -- ../../grpo_math_1B.yaml -- grpo-deepscaler-1.5b-8K.yaml + - ../../grpo_math_1B.yaml + - grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.001 ratio_clip_max: 0.28 @@ -12,3 +12,4 @@ policy: 
sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 + _v2: false diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml index 2bf34c47d1..eec67f1340 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml @@ -1,6 +1,6 @@ defaults: -- ../../grpo_math_1B.yaml -- grpo-deepscaler-1.5b-8K.yaml + - ../../grpo_math_1B.yaml + - grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.0001 ratio_clip_max: 0.28 @@ -12,6 +12,7 @@ policy: sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 + _v2: false sequence_packing: enabled: false optimizer: diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml index 48d3317e81..46193f04da 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml @@ -17,6 +17,7 @@ policy: cpu_offload: true sequence_parallel: true activation_checkpointing: true + _v2: false sequence_packing: enabled: false optimizer: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml index c50ea4834b..8b0157d2d3 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml @@ -15,6 +15,7 @@ policy: dtensor_cfg: activation_checkpointing: true tensor_parallel_size: 8 + _v2: false sequence_packing: enabled: false make_sequence_length_divisible_by: 8 diff --git a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml index 547b4c4382..0f410c436a 100644 --- a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml @@ -19,6 +19,7 @@ policy: cpu_offload: true sequence_parallel: true activation_checkpointing: true + _v2: false sequence_packing: enabled: false optimizer: From a844f62b008ebb3ff2f036e341590e3c38d7b754 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:10:14 +0000 Subject: [PATCH 07/15] copyright Signed-off-by: Terry Kong --- tools/config_cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/config_cli.py b/tools/config_cli.py index 14010d8d43..8656de96db 100755 --- a/tools/config_cli.py +++ b/tools/config_cli.py @@ -4,6 +4,19 @@ # "omegaconf" # ] # /// +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Utilities for working with YAML configs in this repo. 
Subcommands: From 3f3016df439dbe94fba1294233dbf4c093e0fe4a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:36:15 -0700 Subject: [PATCH 08/15] Update examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 78c3e80336..8b3a43ea28 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -28,7 +28,7 @@ logger: tensorboard_enabled: true wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick cluster: gpus_per_node: 8 num_nodes: 4 From d67058d6e4cfcb71ab0e036836756ef045aa6af0 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:36:38 -0700 Subject: [PATCH 09/15] Update examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 72ac01081d..8df4bc3fb0 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -26,7 +26,7 @@ logger: tensorboard_enabled: true wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron + name: dpo-llama3.1-8b-instruct-4n8g-megatron-tp4.v2 cluster: gpus_per_node: 8 num_nodes: 4 From fb5ced87ae4c651ffa9d7892c627f2f7e6140919 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:38:03 -0700 Subject: [PATCH 10/15] Update examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 15ca65c8f9..bc3b7fcb3d 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -26,4 +26,5 @@ logger: project: nemo-rl name: grpo-gemma3-1b-it-1n8g-fsdp2tp1 cluster: + num_nodes: 1 gpus_per_node: 8 From 661f48ae3203bca580155a9c0b2394e652fc1146 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:40:22 +0000 Subject: [PATCH 11/15] revert Signed-off-by: Terry Kong --- .../configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index bc3b7fcb3d..15ca65c8f9 100644 --- 
a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
+++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
@@ -26,5 +26,4 @@ logger:
     project: nemo-rl
     name: grpo-gemma3-1b-it-1n8g-fsdp2tp1
 cluster:
-  num_nodes: 1
   gpus_per_node: 8

From a236134b9a12d839e8a502e2c238ac23a486e913 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 24 Sep 2025 14:33:43 -0700
Subject: [PATCH 12/15] Update tools/config_cli.py

Co-authored-by: Yi-Fu Wu
Signed-off-by: Terry Kong
---
 tools/config_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/config_cli.py b/tools/config_cli.py
index 8656de96db..9defc451eb 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -28,7 +28,7 @@
   - minimize-check: Same args as `minimize` but only checks if minimization
     would change the file; exits non-zero if changes are needed.

-Both commands support printing to stdout or in-place editing of the config file.
+The `expand` and `minimize` commands support printing to stdout or in-place editing of the config file.

 Example:
     # Expand a config with a root level "defaults" key to see the full config; print to stdout

From 8b59d232b88f5630b66aa43652fe4b26e7e7b6d5 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 24 Sep 2025 15:17:48 -0700
Subject: [PATCH 13/15] Update .pre-commit-config.yaml

Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .pre-commit-config.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b09f6cceb3..cc02d93e42 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,14 +47,16 @@ repos:
       additional_dependencies: []
       minimum_pre_commit_version: "2.9.2"

-  # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches
-  # what you want merge in early otherwise you risk running one experiment, but when you merge the config
-  # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters.
-  # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with
-  # the default one, it gets our recommended chat_template which is not what comes from the config.
+  # This pre-commit hook ensures that the config file is minimized and reflects exactly what you
+  # intend to merge. Without it, you might run experiments with one config, but when merging upstream,
+  # the config could silently fall back to the base defaults, resulting in different hyperparameters.
   #
-  # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config
-  # is minimized before accepting the recipe upstream.
+  # For example, we've seen cases where an SFT recipe runs without a custom chat_template. When merged,
+  # it unexpectedly picks up the default recommended chat_template from upstream, which doesn't match
+  # the original experiment setup.
+  #
+  # If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
+  # is accepted upstream, we expect the config to be minimized.
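+  #
+  # As a sketch (hypothetical recipe shown for illustration only; not one of the files changed
+  # here), a minimized config keeps only the keys that differ from its base and points at the
+  # base via a relative `defaults` entry:
+  #
+  #   defaults: ../../sft.yaml
+  #   policy:
+  #     model_name: meta-llama/Llama-3.1-8B
+  #
+  # Every key left out is inherited from the base config when the recipe is loaded.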
From 8b59d232b88f5630b66aa43652fe4b26e7e7b6d5 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 24 Sep 2025 15:17:48 -0700
Subject: [PATCH 13/15] Update .pre-commit-config.yaml

Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .pre-commit-config.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b09f6cceb3..cc02d93e42 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,14 +47,16 @@ repos:
       additional_dependencies: []
       minimum_pre_commit_version: "2.9.2"
 
-  # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches
-  # what you want merge in early otherwise you risk running one experiment, but when you merge the config
-  # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters.
-  # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with
-  # the default one, it gets our recommended chat_template which is not what comes from the config.
+  # This pre-commit hook ensures that the config file is minimized and reflects exactly what you
+  # intend to merge. Without it, you might run experiments with one config, but when merging upstream,
+  # the config could silently fall back to the base defaults—resulting in different hyperparameters.
   #
-  # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config
-  # is minimized before accepting the recipe upstream.
+  # For example, we’ve seen cases where an SFT recipe runs without a custom chat_template. When merged,
+  # it unexpectedly picks up the default recommended chat_template from upstream, which doesn’t match
+  # the original experiment setup.
+  #
+  # If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
+  # is accepted upstream, we expect the config to be minimized.
   - repo: local
     hooks:
       - id: configs-minimize-check-llm
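The merge hazard the rewritten comment describes can be reproduced directly with OmegaConf. A self-contained sketch; the `sft`, `chat_template`, and `lr` keys are illustrative, not taken from a real recipe:

```python
# Sketch of the merge hazard motivating minimize-check (keys are made up).
from omegaconf import OmegaConf

base = OmegaConf.create({"sft": {"chat_template": "recommended", "lr": 1e-5}})

# A recipe that was run WITHOUT a chat_template override: the experiment used
# whatever the code falls back to, but the file does not record that choice.
recipe = OmegaConf.create({"sft": {"lr": 2e-5}})

merged = OmegaConf.merge(base, recipe)
# The merged config now carries the base's chat_template, which may not be
# what the original experiment actually ran with.
assert merged.sft.chat_template == "recommended"
print(OmegaConf.to_yaml(merged))
```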
From 044385cfe327626840d710eb51864591652ba031 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 25 Sep 2025 06:50:08 +0000
Subject: [PATCH 14/15] unit tests

Signed-off-by: Terry Kong
---
 tests/unit/tools/test_config_cli.py | 267 ++++++++++++++++++++++++++++
 tools/config_cli.py                 | 126 +++++++++----
 2 files changed, 361 insertions(+), 32 deletions(-)
 create mode 100644 tests/unit/tools/test_config_cli.py

diff --git a/tests/unit/tools/test_config_cli.py b/tests/unit/tools/test_config_cli.py
new file mode 100644
index 0000000000..805b5a2a5a
--- /dev/null
+++ b/tests/unit/tools/test_config_cli.py
@@ -0,0 +1,267 @@
+import importlib.util
+import inspect
+import os
+from pathlib import Path
+from textwrap import dedent
+from typing import Any
+
+import pytest
+from omegaconf import OmegaConf
+
+
+def _load_cli_module() -> Any:
+    # Use a path relative to this test file to import tools/config_cli.py
+    test_file = Path(__file__).resolve()
+    repo_root = test_file.parents[3]
+    cli_path = repo_root / "tools" / "config_cli.py"
+    assert cli_path.exists(), f"Expected CLI at {cli_path}"
+    spec = importlib.util.spec_from_file_location("config_cli", str(cli_path))
+    assert spec and spec.loader
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore[arg-type]
+    return module
+
+
+@pytest.fixture(scope="module")
+def cli() -> Any:
+    return _load_cli_module()
+
+
+def test__resolve_path_absolute_and_relative(cli: Any, tmp_path: Path) -> None:
+    base = tmp_path
+    # absolute input stays absolute
+    abs_in = "/etc/hosts"
+    assert str(cli.resolve_path(base, abs_in)) == abs_in
+    # relative input resolves against base
+    rel_in = "sub/dir/file.yaml"
+    expected = (base / rel_in).resolve()
+    assert cli.resolve_path(base, rel_in) == expected
+
+
+def test__prune_equal_basic(cli: Any) -> None:
+    # Dict pruning: remove keys equal to base, keep differences
+    a = {"a": 1, "b": {"c": 2, "d": 3}}
+    b = {"a": 1, "b": {"c": 9, "d": 3}}
+    out = cli._prune_equal(a, b)
+    assert out == {"b": {"c": 2}}
+
+    # List pruning: equal lists of same length return REMOVE sentinel
+    a_list = [1, 2, 3]
+    b_list = [1, 2, 3]
+    out_list = cli._prune_equal(a_list, b_list)
+    assert out_list is cli.REMOVE
+
+    # Base-type equality returns REMOVE
+    assert cli._prune_equal(5, 5) is cli.REMOVE
+    # Different base-types keep original
+    assert cli._prune_equal(5, 6) == 5
+
+
+def test__ensure_defaults_relative_variants(cli: Any, tmp_path: Path) -> None:
+    base = tmp_path / "configs" / "base.yaml"
+    child = tmp_path / "recipes" / "child.yaml"
+    child.parent.mkdir(parents=True, exist_ok=True)
+    base.parent.mkdir(parents=True, exist_ok=True)
+    base.write_text("base: true\n")
+    child.write_text("child: true\n")
+
+    # Case 1: no defaults in child
+    cfg: dict[str, Any] = {"child": True}
+    cli._ensure_defaults_relative(child, base, cfg)
+    rel = os.path.relpath(str(base), start=str(child.parent))
+    assert cfg["defaults"] == rel
+
+    # Case 2: defaults as string (ensure base inserted first if missing)
+    cfg2: dict[str, Any] = {"defaults": "something.yaml"}
+    cli._ensure_defaults_relative(child, base, cfg2)
+    val = cfg2["defaults"]
+    if isinstance(val, list):
+        assert val[0] == rel
+    else:
+        # collapsed to a string only if single element
+        assert val == rel or val == "something.yaml"
+
+    # Case 3: defaults list, ensure base is present and order preserved otherwise
+    cfg3: dict[str, Any] = {"defaults": ["x.yaml", "y.yaml"]}
+    cli._ensure_defaults_relative(child, base, cfg3)
+    assert isinstance(cfg3["defaults"], list)
+    assert cfg3["defaults"][0] == rel
+
+
+def test_minimize_in_place_and_check(
+    cli: Any, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    base = tmp_path / "base.yaml"
+    child = tmp_path / "child.yaml"
+    base.write_text(
+        dedent(
+            """
+            common:
+              a: 1
+              list: [1, 2]
+              nested:
+                x: 0
+            top_only: 7
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            common:
+              a: 1
+              list: [1, 2]
+              nested:
+                x: 1
+            new_top: 42
+            """
+        ).strip()
+    )
+
+    # Before minimizing, check should fail
+    ns = type("NS", (), {"base": str(base), "config": str(child)})
+    ret = cli.minimize_check(ns)
+    assert ret == 1
+    err = capsys.readouterr().err
+    assert "Suggested fix" in err
+
+    # Minimize in place
+    ns2 = type("NS", (), {"base": str(base), "config": str(child), "in_place": True})
+    ret2 = cli.minimize(ns2)
+    assert ret2 == 0
+    minimized = child.read_text().strip()
+    rel = os.path.relpath(str(base), start=str(child.parent))
+    assert minimized.splitlines()[0].startswith("defaults:")
+    assert rel in minimized
+    # Ensure pruned keys are gone and differences stay
+    assert "top_only" not in minimized
+    assert "new_top" in minimized
+    # Tolerate either 2- or 4-space YAML indentation in the emitted file
+    normalized = minimized.replace("\r\n", "\n")
+    assert "nested:\n  x: 1" in normalized or "nested:\n    x: 1" in normalized
+
+    # After minimizing, check should pass
+    ret3 = cli.minimize_check(ns)
+    assert ret3 == 0
+
+
+def test_expand_and_compare(
+    cli: Any, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    parent = tmp_path / "parent.yaml"
+    child = tmp_path / "child.yaml"
+    parent.write_text(
+        dedent(
+            """
+            base_value: 10
+            block:
+              a: 1
+              b: 2
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base_value: 11
+            block:
+              b: 3
+              c: 4
+            """
+        ).strip()
+    )
+
+    # expand should merge without resolving interpolations; capture stdout
+    ns = type("NS", (), {"config": str(child), "in_place": False})
+    ret = cli.expand(ns)
+    assert ret == 0
+    out = capsys.readouterr().out
+    # Expect merged keys present
+    assert "base_value: 11" in out
+    assert "a: 1" in out and "b: 3" in out and "c: 4" in out
+
+    # compare identical files prints identical message
+    ns_cmp = type("NS", (), {"left": str(child), "right": str(child)})
+    ret_cmp = cli.compare(ns_cmp)
+    assert ret_cmp == 0
+    out_cmp = capsys.readouterr().out
+    assert "Configs are identical" in out_cmp
+
+    # compare different files prints sections: changed
+    alt = tmp_path / "alt.yaml"
+    alt.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base_value: 12
+            block:
+              a: 9
+              b: 3
+              d: 5
+            """
+        ).strip()
+    )
+    ns_cmp2 = type("NS", (), {"left": str(child), "right": str(alt)})
+    ret_cmp2 = cli.compare(ns_cmp2)
+    assert ret_cmp2 == 0
+    out_cmp2 = capsys.readouterr().out
+    assert "Comparing configs" in out_cmp2
+    assert "Added in Right" in out_cmp2
+    assert "Changed (Left -> Right)" in out_cmp2
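+
+
+# The two tests below guard the vendored loader in tools/config_cli.py against
+# drift from nemo_rl.utils.config: the first checks behavioral equivalence on a
+# small parent/child config pair, and the second enforces an exact source-level
+# match via inspect.getsource.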
+def test_vendored_loader_behavior_matches_upstream(tmp_path: Path) -> None:
+    # Prepare simple parent/child config files
+    parent = tmp_path / "parent.yaml"
+    child = tmp_path / "child.yaml"
+    parent.write_text(
+        dedent(
+            """
+            base: 1
+            block:
+              a: 2
+              b: 3
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base: 9
+            block:
+              b: 7
+              c: 4
+            """
+        ).strip()
+    )
+
+    # Use text-level expansion comparison by importing both implementations
+    # Vendored
+    cli = _load_cli_module()
+    vendored_cfg = cli.load_config_with_inheritance(str(child))
+    vendored = OmegaConf.to_container(vendored_cfg)
+
+    # Upstream via direct import; if it fails, the test should fail
+    import nemo_rl.utils.config as upstream
+
+    upstream_cfg = upstream.load_config_with_inheritance(str(child))
+    upstream_out = OmegaConf.to_container(upstream_cfg)
+
+    assert vendored == upstream_out
+
+
+def test_vendored_loader_drift_against_upstream_source() -> None:
+    # Enforce exact copy-paste: the vendored function's source must match upstream exactly
+    cli = _load_cli_module()
+    vendored_fn = cli.load_config_with_inheritance
+
+    import nemo_rl.utils.config as upstream
+
+    upstream_fn = upstream.load_config_with_inheritance
+
+    up_src = inspect.getsource(upstream_fn).strip()
+    ven_src = inspect.getsource(vendored_fn).strip()
+    assert up_src == ven_src
diff --git a/tools/config_cli.py b/tools/config_cli.py
index 9defc451eb..38500cb02a 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -79,7 +79,7 @@
 # VENDORED SECTION: Minimal self-contained config loader (no nemo_rl dependency)
 #
 # Original source: `nemo_rl/utils/config.py`
-# - Functions adapted: `_resolve_path`, `load_config_with_inheritance`, `load_config`
+# - Functions adapted: `resolve_path`, `load_config_with_inheritance`, `load_config`
 # - Purpose: avoid importing from nemo_rl so this script is standalone
 # - If upstream changes, consider updating this vendored block accordingly
 # ============================================================================
@@ -88,58 +88,120 @@
 from omegaconf import DictConfig, ListConfig, OmegaConf
 
 
-def _resolve_path(base_path: Path, path: str) -> Path:
+def resolve_path(base_path: Path, path: str) -> Path:
+    """Resolve a path relative to the base path."""
     if path.startswith("/"):
         return Path(path)
-    return (base_path / path).resolve()
+    return base_path / path
 
 
 def load_config_with_inheritance(
-    config_path: Union[str, Path], base_dir: Optional[Union[str, Path]] = None
+    config_path: Union[str, Path],
+    base_dir: Optional[Union[str, Path]] = None,
 ) -> DictConfig:
-    """Load a YAML config and resolve simple inheritance via a top-level `defaults` key.
+    """Load a config file with inheritance support.
 
-    Supports:
-    - `defaults: parent.yaml` (string)
-    - `defaults: [parent1.yaml, parent2.yaml]` (list)
-    - Nested inheritance via parent files with their own `defaults`.
+    Args:
+        config_path: Path to the config file
+        base_dir: Base directory for resolving relative paths.
+            If None, uses config_path's directory
+
+    Returns:
+        Merged config dictionary
     """
-    config_path = Path(config_path).resolve()
+    config_path = Path(config_path)
     if base_dir is None:
         base_dir = config_path.parent
     base_dir = Path(base_dir)
 
-    cfg = OmegaConf.load(config_path)
-    if not isinstance(cfg, DictConfig):
-        raise TypeError(
-            f"Config at {config_path} must be a mapping (DictConfig), got {type(cfg)}"
-        )
+    config = OmegaConf.load(config_path)
+    assert isinstance(config, DictConfig), (
+        "Config must be a Dictionary Config (List Config not supported)"
+    )
 
-    if "defaults" in cfg:
-        defaults = cfg.pop("defaults")
+    # Handle inheritance
+    if "defaults" in config:
+        defaults = config.pop("defaults")
         if isinstance(defaults, (str, Path)):
-            defaults_list = [str(defaults)]
+            defaults = [defaults]
         elif isinstance(defaults, ListConfig):
-            defaults_list = [str(d) for d in defaults]
-        elif isinstance(defaults, list):
-            defaults_list = [str(d) for d in defaults]
-        else:
-            raise TypeError(
-                f"Unsupported type for defaults: {type(defaults)} in {config_path}"
-            )
+            defaults = [str(d) for d in defaults]
 
-        merged: DictConfig = OmegaConf.create({})  # type: ignore[assignment]
-        for default_entry in defaults_list:
-            parent_path = _resolve_path(base_dir, str(default_entry))
-            parent_cfg = load_config_with_inheritance(parent_path, base_dir)
-            merged = cast(DictConfig, OmegaConf.merge(merged, parent_cfg))
+        # Load and merge all parent configs
+        base_config = OmegaConf.create({})
+        for default in defaults:
+            parent_path = resolve_path(base_dir, str(default))
+            parent_config = load_config_with_inheritance(parent_path, base_dir)
+            base_config = cast(DictConfig, OmegaConf.merge(base_config, parent_config))
 
-        cfg = cast(DictConfig, OmegaConf.merge(merged, cfg))
+        # Merge with current config
+        config = cast(DictConfig, OmegaConf.merge(base_config, config))
 
-    return cfg
+    return config
 
 
 def load_config(config_path: Union[str, Path]) -> DictConfig:
+    """Load a config file with inheritance support and convert it to an OmegaConf object.
+
+    The config inheritance system supports:
+
+    1. Single inheritance:
+    ```yaml
+    # child.yaml
+    defaults: parent.yaml
+    common:
+      value: 43
+    ```
+
+    2. Multiple inheritance:
+    ```yaml
+    # child.yaml
+    defaults:
+      - parent1.yaml
+      - parent2.yaml
+    common:
+      value: 44
+    ```
+
+    3. Nested inheritance:
+    ```yaml
+    # parent.yaml
+    defaults: grandparent.yaml
+    common:
+      value: 43
+
+    # child.yaml
+    defaults: parent.yaml
+    common:
+      value: 44
+    ```
+
+    4. Variable interpolation:
+    ```yaml
+    # parent.yaml
+    base_value: 42
+    derived:
+      value: ${base_value}
+
+    # child.yaml
+    defaults: parent.yaml
+    base_value: 43  # This will update both base_value and derived.value
+    ```
+
+    The system handles:
+    - Relative and absolute paths
+    - Multiple inheritance
+    - Nested inheritance
+    - Variable interpolation
+
+    The inheritance is resolved depth-first, with later configs overriding earlier ones.
+    This means in multiple inheritance, the last config in the list takes precedence.
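+
+    For example, with `defaults: [parent1.yaml, parent2.yaml]`, any key present
+    in both parents takes its value from `parent2.yaml`, and keys set in the
+    child itself override both parents.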
+
+    Args:
+        config_path: Path to the config file
+
+    Returns:
+        Merged config dictionary
+    """
     return load_config_with_inheritance(config_path)

From 20ee149506a50c37b018940831617672abf87dd9 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 25 Sep 2025 06:51:11 +0000
Subject: [PATCH 15/15] copyright

Signed-off-by: Terry Kong
---
 tests/unit/tools/test_config_cli.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/unit/tools/test_config_cli.py b/tests/unit/tools/test_config_cli.py
index 805b5a2a5a..63af6c8294 100644
--- a/tests/unit/tools/test_config_cli.py
+++ b/tests/unit/tools/test_config_cli.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib.util
 import inspect
 import os
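As a quick sanity check of the inheritance semantics documented in PATCH 14, here is a small self-contained sketch that exercises the vendored loader the same way the unit tests do. It assumes it is run from the repo root with omegaconf installed; the file names and keys are made up for illustration:

```python
# Illustrative sketch: load tools/config_cli.py by path (as the unit tests do)
# and confirm that the last entry in a defaults list takes precedence.
import importlib.util
import tempfile
from pathlib import Path

spec = importlib.util.spec_from_file_location("config_cli", "tools/config_cli.py")
assert spec and spec.loader
cli = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cli)

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    (root / "parent1.yaml").write_text("lr: 1\nseed: 7\n")
    (root / "parent2.yaml").write_text("lr: 2\n")
    (root / "child.yaml").write_text(
        "defaults:\n  - parent1.yaml\n  - parent2.yaml\nbatch: 8\n"
    )
    cfg = cli.load_config_with_inheritance(str(root / "child.yaml"))
    # parent2 is last in the defaults list, so its lr overrides parent1's;
    # keys unique to a parent (seed) or to the child (batch) survive the merge.
    assert cfg.lr == 2 and cfg.seed == 7 and cfg.batch == 8
```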