From 8398b856ee38cfc0bebd01a51d2f3ceff8b7585a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 29 Aug 2025 06:26:21 +0000 Subject: [PATCH 01/15] feat: add config_tools.py and refactor configs Signed-off-by: Terry Kong compare command Signed-off-by: Terry Kong config changes Signed-off-by: Terry Kong Revert "config changes" This reverts commit 25b87e2c603c56bedbabf3d8898bfe6ec6aa2ae9. Signed-off-by: Terry Kong cleanup Signed-off-by: Terry Kong vlm example Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit 137548006f003c7380756117db9732209eafd02b. Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit a4cd8a4870964d0a3a180805980ecdd943bd70a5. Signed-off-by: Terry Kong minimize configs Signed-off-by: Terry Kong force sft configs to use default chat template to match last releases behavior Signed-off-by: Terry Kong reverting select configs to v1 to address Signed-off-by: Terry Kong add pre-commit and add a minimize-check func Signed-off-by: Terry Kong Revert "reverting select configs to v1 to address" This reverts commit d81f806f0c5e6a7370321b6bbae9a87448bc7317. Signed-off-by: Terry Kong Revert "force sft configs to use default chat template to match last releases" This reverts commit be01df7f69f89667b0face375bad561021d6b180. Signed-off-by: Terry Kong Revert "minimize configs" This reverts commit e54f144eef3e6dbe451154a2bbcbcf39099ea3a9. Signed-off-by: Terry Kong --- .pre-commit-config.yaml | 25 +++ tools/config_cli.py | 450 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 475 insertions(+) create mode 100755 tools/config_cli.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7d1a05182d..6b55ea31e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,3 +46,28 @@ repos: require_serial: true additional_dependencies: [] minimum_pre_commit_version: "2.9.2" + + - repo: local + hooks: + - id: configs-minimize-check-llm + name: minimize-check llm recipes + language: system + pass_filenames: false + entry: bash + args: + - -lc + - | + set -euo pipefail + base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done + - id: configs-minimize-check-vlm + name: minimize-check vlm recipes + language: system + pass_filenames: false + entry: bash + args: + - -lc + - | + set -euo pipefail + base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done diff --git a/tools/config_cli.py b/tools/config_cli.py new file mode 100755 index 0000000000..9583a55011 --- /dev/null +++ b/tools/config_cli.py @@ -0,0 +1,450 @@ +#!/usr/bin/env -S uv run --script -q +# /// script +# dependencies = [ +# "omegaconf" +# ] +# /// +"""Utilities for working with YAML configs in this repo. + +Subcommands: + - expand: Resolve a config with OmegaConf interpolation and inheritance. 
+  - minimize: Given a base config and a config, remove keys in the config that
+    are equal to the base, and ensure a defaults entry pointing to the base
+    exists. The defaults path in the resulting config is written relative to
+    the directory of the config being minimized.
+  - minimize-check: Same args as `minimize` but only checks if minimization
+    would change the file; exits non-zero if changes are needed.
+
+The `expand` and `minimize` subcommands support printing to stdout or in-place editing of the config file.
+
+Example:
+    # Expand a config with a root level "defaults" key to see the full config; print to stdout
+    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+
+    # Expand a config with a root level "defaults" key to see the full config; edit the config in place
+    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+
+    # Minimize a config and remove all keys that are present in the base config; print to stdout
+    # uv run tools/config_cli.py minimize
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+
+    # Minimize a config and remove all keys that are present in the base config; edit the config in place
+    # uv run tools/config_cli.py minimize
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+
+    # Minimize all the llm configs:
+    for algo in grpo dpo sft; do
+      base_config=examples/configs/${algo}.yaml
+      if [[ ${algo} == grpo ]]; then
+        base_config=examples/configs/grpo_math_1B.yaml
+      fi
+      for recipe in examples/configs/recipes/llm/${algo}-*.yaml; do
+        uv run tools/config_cli.py minimize $base_config $recipe --in-place
+      done
+    done
+
+    # Minimize vlm configs:
+    for recipe in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do
+      uv run tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
+    done
+
+    # Compare two configs
+    uv run tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
+
+    # Minimize a config and compare it to not minimizing (should be the same)
+    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml >examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized
+    uv run tools/config_cli.py compare \
+        examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml \
+        examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# ============================================================================
+# VENDORED SECTION: Minimal self-contained config loader (no nemo_rl dependency)
+#
+# Original source: `nemo_rl/utils/config.py`
+#   - Functions adapted: `_resolve_path`, `load_config_with_inheritance`, `load_config`
+#   - Purpose: avoid importing from nemo_rl so this script is standalone
+#   - If upstream changes, consider updating this vendored block accordingly
+# ============================================================================
+from typing import Any, Iterable, Optional, Union, cast
+
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+
+def _resolve_path(base_path: Path, path: str) -> Path:
+    if path.startswith("/"):
+        return Path(path)
+    return (base_path / path).resolve()
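+
+# Illustrative sketch of the `defaults` inheritance resolved below (hypothetical
+# file contents, not real repo configs):
+#
+#   base.yaml:            child.yaml:
+#     lr: 1.0e-5            defaults: base.yaml
+#     optim:                optim:
+#       name: adamw           name: sgd
+#
+#   load_config("child.yaml") -> {"lr": 1e-05, "optim": {"name": "sgd"}}
+#
+# Parents listed in `defaults` are merged in order, and the child is merged
+# last, so child keys override parent keys.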
+
+
+def load_config_with_inheritance(
+    config_path: Union[str, Path], base_dir: Optional[Union[str, Path]] = None
+) -> DictConfig:
+    """Load a YAML config and resolve simple inheritance via a top-level `defaults` key.
+
+    Supports:
+      - `defaults: parent.yaml` (string)
+      - `defaults: [parent1.yaml, parent2.yaml]` (list)
+      - Nested inheritance via parent files with their own `defaults`.
+    """
+    config_path = Path(config_path).resolve()
+    if base_dir is None:
+        base_dir = config_path.parent
+    base_dir = Path(base_dir)
+
+    cfg = OmegaConf.load(config_path)
+    if not isinstance(cfg, DictConfig):
+        raise TypeError(
+            f"Config at {config_path} must be a mapping (DictConfig), got {type(cfg)}"
+        )
+
+    if "defaults" in cfg:
+        defaults = cfg.pop("defaults")
+        if isinstance(defaults, (str, Path)):
+            defaults_list = [str(defaults)]
+        elif isinstance(defaults, ListConfig):
+            defaults_list = [str(d) for d in defaults]
+        elif isinstance(defaults, list):
+            defaults_list = [str(d) for d in defaults]
+        else:
+            raise TypeError(
+                f"Unsupported type for defaults: {type(defaults)} in {config_path}"
+            )
+
+        merged: DictConfig = OmegaConf.create({})  # type: ignore[assignment]
+        for default_entry in defaults_list:
+            parent_path = _resolve_path(base_dir, str(default_entry))
+            parent_cfg = load_config_with_inheritance(parent_path, base_dir)
+            merged = cast(DictConfig, OmegaConf.merge(merged, parent_cfg))
+
+        cfg = cast(DictConfig, OmegaConf.merge(merged, cfg))
+
+    return cfg
+
+
+def load_config(config_path: Union[str, Path]) -> DictConfig:
+    return load_config_with_inheritance(config_path)
+
+
+# ============================================================================
+# END VENDORED SECTION
+# ============================================================================
+
+
+def _dict_like(obj: Any) -> bool:
+    return isinstance(obj, dict)
+
+
+def _list_like(obj: Any) -> bool:
+    return isinstance(obj, list)
+
+
+REMOVE = object()
+
+
+def _prune_equal(a: Any, b: Any) -> Any:
+    """Return a copy of `a` with entries equal to `b` removed.
+
+    - If both are dicts: recursively prune and drop keys whose subtree is empty
+      after pruning or equal.
+    - If both are lists of the same length: drop the list only if it equals the
+      base in its entirety; partial per-index pruning is avoided so ordered
+      config sections keep their meaning.
+    - Else: if equal, return a sentinel indicating removal; otherwise return `a`.
+    """
+    if _dict_like(a) and _dict_like(b):
+        out: dict[str, Any] = {}
+        a_dict: dict[str, Any] = a  # type: ignore[assignment]
+        b_dict: dict[str, Any] = b  # type: ignore[assignment]
+        for key, a_val in a_dict.items():
+            if key in b_dict:
+                pruned = _prune_equal(a_val, b_dict[key])
+                if pruned is REMOVE:
+                    # equal, skip
+                    continue
+                # keep if subtree has content
+                if pruned != {} and pruned != []:
+                    out[key] = pruned
+            else:
+                out[key] = a_val
+        return out
+
+    if _list_like(a) and _list_like(b) and len(a) == len(b):
+        # Only remove if entire list equals base; avoid partial list pruning
+        # to prevent semantic changes in ordered config sections.
+        if a == b:
+            return REMOVE
+        return a
+
+    # Base types
+    if a == b:
+        return REMOVE
+    return a
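+
+# A quick illustrative sketch of the pruning semantics (made-up values, not a doctest):
+#   _prune_equal({"a": 1, "b": {"c": 2, "d": 3}}, {"a": 1, "b": {"c": 2}})
+#   -> {"b": {"d": 3}}                        # "a" and "b.c" match the base and are dropped
+#   _prune_equal([1, 2], [1, 2]) is REMOVE    # lists are only dropped on whole-list equality
+#   _prune_equal([1, 2], [1, 3]) == [1, 2]    # any difference keeps the full list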
+
+
+def _ensure_defaults_relative(
+    child_path: Path, base_path: Path, child_cfg: dict[str, Any]
+) -> None:
+    """Ensure `defaults:` points to the base config via a relative path.
+
+    The stored path must be a string such that, when the resulting minimized
+    config sits at `child_path`, the `defaults` entry resolves to the base
+    config location. Since the loader resolves `defaults` entries relative to
+    the directory of the config being loaded, we store `base_path` expressed
+    relative to the child config's directory.
+    """
+    # Compute a relative reference from child dir to base file
+    import os
+
+    rel_from_child_to_base = os.path.relpath(
+        str(base_path), start=str(child_path.parent)
+    )
+
+    existing = child_cfg.get("defaults")
+    if existing is None:
+        child_cfg["defaults"] = str(rel_from_child_to_base)
+        return
+    # Normalize various forms: string, single list element, list
+    if isinstance(existing, str):
+        existing_list: list[Any] = [existing]
+    else:
+        existing_list = list(existing) if isinstance(existing, Iterable) else [existing]
+    # Put our base at the first position if not present
+    if str(rel_from_child_to_base) not in [str(x) for x in existing_list]:
+        existing_list.insert(0, str(rel_from_child_to_base))
+    # If it's a single element list, collapse to string for this repo's style
+    if len(existing_list) == 1:
+        child_cfg["defaults"] = existing_list[0]
+    else:
+        child_cfg["defaults"] = existing_list
+
+
+def expand(args: argparse.Namespace) -> int:
+    # Merge defaults/inheritance using the repo loader
+    cfg = load_config(str(Path(args.config).resolve()))
+    # Convert without resolving so ${...} interpolations are preserved
+    text = OmegaConf.to_yaml(cfg)
+    if args.in_place:
+        Path(args.config).write_text(text)
+    else:
+        print(text + ("\n" if not text.endswith("\n") else ""), end="")
+    return 0
+
+
+def minimize(args: argparse.Namespace) -> int:
+    child_path = Path(args.config).resolve()
+    base_path = Path(args.base).resolve()
+
+    child_cfg_raw = OmegaConf.load(child_path)
+    if not isinstance(child_cfg_raw, DictConfig):
+        raise TypeError(
+            f"Config at {child_path} must be a mapping (DictConfig), got {type(child_cfg_raw)}"
+        )
+    base_cfg_raw = OmegaConf.load(base_path)
+    if not isinstance(base_cfg_raw, DictConfig):
+        raise TypeError(
+            f"Config at {base_path} must be a mapping (DictConfig), got {type(base_cfg_raw)}"
+        )
+
+    # Resolve both before comparison
+    child_resolved = OmegaConf.to_container(child_cfg_raw)
+    base_resolved = OmegaConf.to_container(base_cfg_raw)
+
+    if not isinstance(child_resolved, dict) or not isinstance(base_resolved, dict):
+        raise TypeError("Both child and base configs must be mappings after resolution")
+
+    pruned = _prune_equal(child_resolved, base_resolved)
+
+    # Ensure mapping output
+    if pruned is None or not isinstance(pruned, dict):
+        pruned = {} if pruned is None else {"value": pruned}
+
+    # Ensure defaults reference base (relative path from child)
+    _ensure_defaults_relative(child_path, base_path, pruned)
+
+    # Ensure `defaults` appears first in the top-level mapping
+    if "defaults" in pruned:
+        pruned = {"defaults": pruned["defaults"], **pruned}
+
+    # Emit
+    text = OmegaConf.to_yaml(OmegaConf.create(pruned))
+    if args.in_place:
+        Path(args.config).write_text(text)
+    else:
+        print(text + ("\n" if not text.endswith("\n") else ""), end="")
+    return 0
+
+
+def _flatten(d: Any, prefix: str = "") -> dict[str, Any]:
+    out: dict[str, Any] = {}
+    if isinstance(d, dict):
+        for k, v in d.items():
+            key = f"{prefix}.{k}" if prefix else str(k)
+            out.update(_flatten(v, key))
+    elif isinstance(d, list):
+        for i, v in enumerate(d):
+            key = f"{prefix}[{i}]"
+            out.update(_flatten(v, key))
+    else:
+        out[prefix] = d
+    return out
+
+
+def compare(args: argparse.Namespace) -> int:
+    left_path =
Path(args.left).resolve() + right_path = Path(args.right).resolve() + + # Expand via repo loader, then convert to plain dict/list so _flatten works + left = OmegaConf.to_container(load_config(str(left_path))) # type: ignore[assignment] + right = OmegaConf.to_container(load_config(str(right_path))) # type: ignore[assignment] + + lf = _flatten(left) + rf = _flatten(right) + + left_keys = set(lf.keys()) + right_keys = set(rf.keys()) + + added = sorted(right_keys - left_keys) + removed = sorted(left_keys - right_keys) + common = sorted(left_keys & right_keys) + + changed: list[str] = [] + for k in common: + if lf[k] != rf[k]: + changed.append(k) + + if not added and not removed and not changed: + print("Configs are identical after expansion") + return 0 + + # Print concise report with explicit left/right context + print("Comparing configs after expansion:") + print(f" Left : {left_path}") + print(f" Right: {right_path}") + + if added: + print("\nAdded in Right (missing in Left):") + for k in added: + print(f" {k} = {rf[k]}") + + if removed: + print("\nRemoved in Right (only in Left):") + for k in removed: + print(f" {k} = {lf[k]}") + + if changed: + print("\nChanged (Left -> Right):") + for k in changed: + print(f" {k}: {lf[k]} -> {rf[k]}") + return 0 + + +def minimize_check(args: argparse.Namespace) -> int: + """Check if minimizing would change the file. Exit non-zero if so. + + Args (same as `minimize`): + base: Base config path + config: Child config path + """ + child_path = Path(args.config).resolve() + base_path = Path(args.base).resolve() + + # Compute minimized text (same as minimize()) + child_cfg_raw = OmegaConf.load(child_path) + base_cfg_raw = OmegaConf.load(base_path) + if not isinstance(child_cfg_raw, DictConfig) or not isinstance( + base_cfg_raw, DictConfig + ): + print( + f"[minimize-check] Both child and base must be mappings: {child_path} vs {base_path}", + file=sys.stderr, + ) + return 2 + + child_resolved = OmegaConf.to_container(child_cfg_raw) + base_resolved = OmegaConf.to_container(base_cfg_raw) + if not isinstance(child_resolved, dict) or not isinstance(base_resolved, dict): + print( + f"[minimize-check] Both child and base must resolve to mappings: {child_path} vs {base_path}", + file=sys.stderr, + ) + return 2 + + pruned = _prune_equal(child_resolved, base_resolved) + if pruned is None or not isinstance(pruned, dict): + pruned = {} if pruned is None else {"value": pruned} + _ensure_defaults_relative(child_path, base_path, pruned) + if "defaults" in pruned: + pruned = {"defaults": pruned["defaults"], **pruned} + minimized_text = OmegaConf.to_yaml(OmegaConf.create(pruned)) + + # Normalize current file via OmegaConf to reduce noise from formatting differences + try: + current_norm_text = OmegaConf.to_yaml(OmegaConf.load(child_path)) + except Exception: + current_norm_text = child_path.read_text() + + if current_norm_text != minimized_text: + print( + f"[minimize-check] {child_path} is not minimized.\n" + f" Suggested fix: tools/config_cli.py minimize {base_path} {child_path} --in-place", + file=sys.stderr, + ) + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Config tools (expand, minimize)") + sub = parser.add_subparsers(dest="cmd", required=True) + + p_expand = sub.add_parser("expand", help="Resolve a config with OmegaConf") + p_expand.add_argument("config", help="Path to config YAML") + p_expand.add_argument( + "--in-place", + action="store_true", + dest="in_place", + help="Edit file in place instead of printing", 
+ ) + p_expand.set_defaults(func=expand) + + p_min = sub.add_parser( + "minimize", + help="Remove keys equal to base and ensure defaults reference base", + ) + p_min.add_argument("base", help="Base config path") + p_min.add_argument("config", help="Child config path") + p_min.add_argument( + "--in-place", + action="store_true", + dest="in_place", + help="Edit file in place instead of printing", + ) + p_min.set_defaults(func=minimize) + + p_cmp = sub.add_parser( + "compare", help="Compare two configs after expanding their defaults" + ) + p_cmp.add_argument("left", help="Left config path") + p_cmp.add_argument("right", help="Right config path") + p_cmp.set_defaults(func=compare) + + p_minchk = sub.add_parser( + "minimize-check", + help=( + "Exit non-zero if minimizing would change the file; args mirror `minimize`" + ), + ) + p_minchk.add_argument("base", help="Base config path") + p_minchk.add_argument("config", help="Child config path") + p_minchk.set_defaults(func=minimize_check) + + args = parser.parse_args() + ret = args.func(args) + if isinstance(ret, int): + sys.exit(ret) From 3b06882ac0fc5f68afe3a3dc2d5ae610212df8f4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:06:09 +0000 Subject: [PATCH 02/15] commit Signed-off-by: Terry Kong --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b55ea31e2..b09f6cceb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,6 +47,14 @@ repos: additional_dependencies: [] minimum_pre_commit_version: "2.9.2" + # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches + # what you want merge in early otherwise you risk running one experiment, but when you merge the config + # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters. + # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with + # the default one, it gets our recommended chat_template which is not what comes from the config. + # + # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config + # is minimized before accepting the recipe upstream. 
   - repo: local
     hooks:
       - id: configs-minimize-check-llm

From 18b0e49c502cf3bd92bd77fb5b9cfa70b4c45b84 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 24 Sep 2025 17:17:12 +0000
Subject: [PATCH 03/15] fix docstring to reflect it is a script

Signed-off-by: Terry Kong
---
 tools/config_cli.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/config_cli.py b/tools/config_cli.py
index 9583a55011..14010d8d43 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -19,18 +19,18 @@
 
 Example:
     # Expand a config with a root level "defaults" key to see the full config; print to stdout
-    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+    tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
 
     # Expand a config with a root level "defaults" key to see the full config; edit the config in place
-    uv run tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+    tools/config_cli.py expand examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
 
     # Minimize a config and remove all keys that are present in the base config; print to stdout
-    # uv run tools/config_cli.py minimize
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
+    # tools/config_cli.py minimize
+    tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
 
     # Minimize a config and remove all keys that are present in the base config; edit the config in place
-    # uv run tools/config_cli.py minimize
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
+    # tools/config_cli.py minimize
+    tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml --in-place
 
     # Minimize all the llm configs:
     for algo in grpo dpo sft; do
@@ -39,21 +39,21 @@
         base_config=examples/configs/grpo_math_1B.yaml
       fi
       for recipe in examples/configs/recipes/llm/${algo}-*.yaml; do
-        uv run tools/config_cli.py minimize $base_config $recipe --in-place
+        tools/config_cli.py minimize $base_config $recipe --in-place
       done
     done
 
     # Minimize vlm configs:
     for recipe in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do
-      uv run tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
+      tools/config_cli.py minimize examples/configs/vlm_grpo_3B.yaml $recipe --in-place
     done
 
     # Compare two configs
-    uv run tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
+    tools/config_cli.py compare examples/configs/grpo_math_1B.yaml examples/configs/grpo_math_8B.yaml
 
     # Minimize a config and compare it to not minimizing (should be the same)
-    uv run tools/config_cli.py minimize examples/configs/dpo.yaml examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
>examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized + tools/config_cli.py compare \ examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml \ examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml.minimized """ From 029c68f86a3034d9332f3c1c23e6db3536f31fa8 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:17:21 +0000 Subject: [PATCH 04/15] minimize configs Signed-off-by: Terry Kong --- ....1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml | 79 ++------ ...po-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml | 83 ++------ ...llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 102 +--------- ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 100 +--------- .../dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml | 48 ++--- ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml | 83 +------- ...truct-2407-1n8g-fsdp2tp8-actckpt-long.yaml | 98 ++-------- .../recipes/llm/grpo-deepscaler-1.5b-16K.yaml | 15 +- .../recipes/llm/grpo-deepscaler-1.5b-24K.yaml | 37 +--- .../recipes/llm/grpo-deepscaler-1.5b-8K.yaml | 134 +------------ .../llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 111 +---------- ...ma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml | 101 +--------- .../llm/grpo-gspo-deepscaler-1.5b-8K.yaml | 128 +----------- ...lama3.1-8b-instruct-1n8g-megatron-fp8.yaml | 141 ++------------ ...b-instruct-2n8g-fsdp2tp1-noncolocated.yaml | 110 ++--------- ...3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml | 115 ++--------- ...llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml | 112 +---------- ...po-llama3.2-1b-instruct-1n8g-megatron.yaml | 135 +------------ ...po-math-qwen3-30ba3b-megatron-tp4-32k.yaml | 160 ++------------- .../grpo-moonlight-16ba3b-4n8g-megatron.yaml | 156 ++------------- ...-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml | 110 ++--------- ...en2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml | 110 ++--------- ...wen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml | 111 ++--------- ...rpo-qwen2.5-7b-instruct-4n8g-megatron.yaml | 161 ++-------------- ...5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml | 112 +---------- .../llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 129 ++----------- ...lama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 110 ++--------- ...lama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 48 +---- .../sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 56 +----- .../llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 54 +----- ...sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 91 +-------- .../llm/sft-llama3.1-8b-1n8g-megatron.yaml | 93 +-------- .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml | 64 +----- ...wen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml | 57 +----- ...3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml | 182 +----------------- ...2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml | 179 +---------------- 36 files changed, 317 insertions(+), 3398 deletions(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml index 72dcb9ad1e..18a84b9cee 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml @@ -1,95 +1,44 @@ +defaults: ../../dpo.yaml dpo: max_num_epochs: 2 max_num_steps: 20 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 
sft_loss_weight: 0.01 - checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 save_period: 10000 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 2048 - precision: "bfloat16" dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 0.98] - eps: 1e-8 - foreach: False - fused: False - + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.000000001 - end_factor: 1.0 - total_iters: 1 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [1] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-09 + end_factor: 1.0 + total_iters: 1 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: + - 1 logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml index 22851b368c..f18407fd59 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4.yaml @@ -1,95 +1,40 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - -checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 8192 - precision: "bfloat16" dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 4 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 
0.98] - eps: 1e-8 - foreach: False - fused: False - + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.000000001 - end_factor: 1.0 - total_iters: 1 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [1] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-09 + end_factor: 1.0 + total_iters: 1 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: + - 1 logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp4 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index f5f0b2e5d7..72ac01081d 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -1,128 +1,32 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - checkpointing: - enabled: false #true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - + enabled: false policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 8192 - precision: "bfloat16" dtensor_cfg: enabled: false - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1.0 - optimizer: null - megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 4 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - 
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 9dd723ec22..78c3e80336 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -1,128 +1,34 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 max_num_steps: 20 val_period: 50 val_batches: 16 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 sft_loss_weight: 0.01 - checkpointing: - enabled: false #true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 + enabled: false save_period: 10000 - checkpoint_must_save_by: null - policy: - model_name: "meta-llama/Llama-3.1-8B-Instruct" + model_name: meta-llama/Llama-3.1-8B-Instruct tokenizer: name: ${policy.model_name} train_global_batch_size: 256 train_micro_batch_size: 1 max_total_sequence_length: 2048 - precision: "bfloat16" dtensor_cfg: enabled: false - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1.0 - optimizer: null - megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false - tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: true - freeze_moe_router: false - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - - optimizer: - optimizer: "adam" - lr: 5.0e-6 #4.0e-5 - min_lr: 5.0e-6 #4.0e-5 - weight_decay: 0.1 - bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: 
"constant" - lr_warmup_iters: 1 - lr_warmup_init: 0.00000001 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml index c3398a6cd5..3527838c62 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-tulu3-1n8g-fsdp2tp1.yaml @@ -1,51 +1,43 @@ -defaults: "../../dpo.yaml" - +defaults: ../../dpo.yaml cluster: - num_nodes: 1 gpus_per_node: 8 - policy: - model_name: "allenai/Llama-3.1-Tulu-3-8B-SFT" + model_name: allenai/Llama-3.1-Tulu-3-8B-SFT tokenizer: - name: "allenai/Llama-3.1-Tulu-3-8B-SFT" + name: allenai/Llama-3.1-Tulu-3-8B-SFT train_micro_batch_size: 1 - train_global_batch_size: 128 max_total_sequence_length: 2048 optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-7 + lr: 5.0e-07 weight_decay: 0.0 scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0e-6 - end_factor: 1.0 - total_iters: 211 - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 1.0 - end_factor: 0.0 - total_iters: 1899 - - milestones: [211] - + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0e-06 + end_factor: 1.0 + total_iters: 211 + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 1.0 + end_factor: 0.0 + total_iters: 1899 + - milestones: + - 211 data: - dataset_name: "Tulu3Preference" - + dataset_name: Tulu3Preference dpo: max_num_steps: 2110 val_period: -1 val_at_start: false - preference_average_log_probs: True + preference_average_log_probs: true reference_policy_kl_penalty: 5 val_micro_batch_size: ${policy.train_micro_batch_size} val_global_batch_size: ${policy.train_global_batch_size} - checkpointing: metric_name: null save_period: 250 - logger: - wandb_enabled: True + wandb_enabled: true wandb: - name: "dpo-tulu3-8b" + name: dpo-tulu3-8b diff --git a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml index 22870f0e66..252251fd76 100644 --- a/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml @@ -1,96 +1,15 @@ +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 - max_num_steps: 150 - val_period: 25 - val_batches: 8 val_global_batch_size: 32 - val_micro_batch_size: 1 val_at_start: false - seed: 42 - - reference_policy_kl_penalty: 0.05 - preference_average_log_probs: False - sft_average_log_probs: ${.preference_average_log_probs} - preference_loss_weight: 1 - sft_loss_weight: 0 - -checkpointing: - enabled: true - checkpoint_dir: "results/dpo" - metric_name: "val_loss" - higher_is_better: false - keep_top_k: 3 - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: 
"meta-llama/Llama-3.2-1B-Instruct" tokenizer: name: ${policy.model_name} - - train_global_batch_size: 128 - train_micro_batch_size: 2 - max_total_sequence_length: 1024 - precision: "bfloat16" - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-6 - weight_decay: 0.1 - betas: [0.9, 0.98] - eps: 1e-5 - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 20 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [20] - -data: - dataset_name: "HelpSteer3" - max_input_seq_length: ${policy.max_total_sequence_length} - shuffle: true - logger: - log_dir: "logs" wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 - cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml index 86a3a6fc97..9833aa30d0 100644 --- a/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long.yaml @@ -1,108 +1,46 @@ -# DPO Algorithm Configuration +defaults: ../../dpo.yaml dpo: - max_num_epochs: 1 max_num_steps: 100 val_period: 10 val_batches: 1 val_global_batch_size: 16 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 - reference_policy_kl_penalty: 0.1 - preference_average_log_probs: False # whether normalizing log probs according to the sequence length in preference_loss - sft_average_log_probs: ${.preference_average_log_probs} # whether normalizing log probs according to the sequence length in sft_loss - - preference_loss_weight: 1 # the coefficient of the preference loss - sft_loss_weight: 0 # the coefficient of the SFT loss - checkpointing: - enabled: true - checkpoint_dir: "results/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" - metric_name: "val_loss" - higher_is_better: false + checkpoint_dir: results/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long keep_top_k: null - save_period: 50 - checkpoint_must_save_by: null - policy: - model_name: "mistralai/Mistral-Nemo-Instruct-2407" + model_name: mistralai/Mistral-Nemo-Instruct-2407 tokenizer: name: ${policy.model_name} - - # number of preference samples per batch - # each preference sample corresponds to a pair of chosen and rejected responses - # so the actual batch size processed by the model is train_global_batch_size * 2 train_global_batch_size: 8 train_micro_batch_size: 1 - - - #logprob_batch_size: ${policy.train_micro_batch_size} max_total_sequence_length: 12288 - precision: "bfloat16" - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false activation_checkpointing: true tensor_parallel_size: 8 - context_parallel_size: 1 - custom_parallel_plan: null clear_cache_every_n_steps: 1 
env_vars: - PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:64" - - dynamic_batching: - enabled: false - - sequence_packing: - enabled: false - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - + PYTORCH_CUDA_ALLOC_CONF: max_split_size_mb:64 optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 1.0e-6 + lr: 1.0e-06 weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - + betas: + - 0.9 + - 0.999 + eps: 1.0e-08 scheduler: - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [] - + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1.0 + total_iters: 10000000000 + - milestones: [] data: - dataset_name: "HelpSteer3" - shuffle: False - max_input_seq_length: ${policy.max_total_sequence_length} - + shuffle: false logger: - log_dir: "logs/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" # Base directory for all logs - wandb_enabled: false # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: false - mlflow_enabled: false - monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal + log_dir: logs/dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long wandb: - project: "nemo-rl" - name: "dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - + project: nemo-rl + name: dpo-mistral-nemo-instruct-2407-1n8g-fsdp2tp8-actckpt-long cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml index 570fecb1b9..8fc6eccbdd 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml @@ -1,23 +1,14 @@ -# GRPO Algorithm Configuration -defaults: "grpo-deepscaler-1.5b-8K.yaml" - +defaults: +- ../../grpo_math_1B.yaml +- grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.001 ratio_clip_max: 0.28 - - policy: max_total_sequence_length: 16384 logprob_batch_size: 2 - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml index 3cd8fabd6d..2bf34c47d1 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml @@ -1,48 +1,23 @@ -# GRPO Algorithm Configuration -defaults: "grpo-deepscaler-1.5b-8K.yaml" - +defaults: +- ../../grpo_math_1B.yaml +- grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.0001 - ratio_clip_min: 0.2 ratio_clip_max: 0.28 - policy: max_total_sequence_length: 24576 logprob_batch_size: 2 - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true 
tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False - sequence_packing: - enabled: False - + enabled: false optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 5.0e-7 - + lr: 5.0e-07 generation: - backend: "vllm" - max_new_tokens: ${policy.max_total_sequence_length} - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null vllm_cfg: - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 gpu_memory_utilization: 0.8 - enforce_eager: True - max_model_len: ${policy.max_total_sequence_length} + enforce_eager: true diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml index f6cc626890..48d3317e81 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml @@ -1,157 +1,37 @@ -# GRPO Algorithm Configuration +defaults: ../../grpo_math_1B.yaml grpo: num_prompts_per_step: 128 num_generations_per_prompt: 8 - max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false max_val_samples: 480 val_batch_size: 32 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - loss_fn: reference_policy_kl_penalty: 0.0 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - checkpointing: - enabled: true - checkpoint_dir: "results/grpo" - metric_name: "val_reward" - higher_is_better: true keep_top_k: 10 - save_period: 10 - checkpoint_must_save_by: null - policy: - model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default + model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B train_global_batch_size: 64 train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 max_total_sequence_length: 8192 - precision: "bfloat16" - dtensor_cfg: - enabled: true cpu_offload: true sequence_parallel: true activation_checkpointing: true - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - dynamic_batching: - enabled: False - sequence_packing: - enabled: False - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - + enabled: false optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 2.0e-6 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - + lr: 2.0e-06 generation: - backend: "vllm" - max_new_tokens: ${policy.max_total_sequence_length} - temperature: 1.0 - top_p: 1.0 - 
top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False vllm_kwargs: compilation_config: - # when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy, - # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile - # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998 - use_inductor: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - + use_inductor: false data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/cot.txt" - system_prompt_file: null - dataset_name: "DeepScaler" - shuffle: true - + dataset_name: DeepScaler env: math: num_workers: 16 - logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: false - mlflow_enabled: false - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - mlflow: - experiment_name: "grpo-dev" - run_name: "grpo-dev-logger" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - + monitor_gpus: false cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 091cb2909a..15ca65c8f9 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -1,136 +1,29 @@ +defaults: ../../grpo_math_1B.yaml grpo: - num_prompts_per_step: 32 - num_generations_per_prompt: 16 - max_rollout_turns: 1 - max_num_epochs: 1 max_num_steps: 500 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true checkpointing: - enabled: true checkpoint_dir: results/grpo-gemma3-1b-it-1n8g-fsdp2tp1 - metric_name: val_reward - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null policy: model_name: google/gemma-3-1b-it tokenizer: name: google/gemma-3-1b-it - train_global_batch_size: 512 - train_micro_batch_size: 4 - generation_batch_size: 32 - logprob_batch_size: 4 - max_total_sequence_length: 512 - precision: bfloat16 - dtensor_cfg: - 
enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 + enabled: true sequence_packing: enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.01 - betas: - - 0.9 - - 0.999 - eps: 1e-08 - foreach: false - fused: false - scheduler: - - name: torch.optim.lr_scheduler.LinearLR - kwargs: - start_factor: 0.1 - end_factor: 1 - total_iters: 50 - - name: torch.optim.lr_scheduler.ConstantLR - kwargs: - factor: 1 - total_iters: 10000000000 - - milestones: - - 50 generation: - backend: vllm max_new_tokens: 512 - temperature: 1 - top_p: 1 - top_k: null - stop_token_ids: null - stop_strings: null vllm_cfg: - async_engine: false - precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 max_model_len: 512 - enforce_eager: False - colocated: - enabled: true - resources: - gpus_per_node: null - num_nodes: null data: max_input_seq_length: 512 - prompt_file: examples/prompts/cot.txt - system_prompt_file: null - dataset_name: OpenMathInstruct-2 - shuffle: true -env: - math: - num_workers: 8 logger: log_dir: logs/grpo-gemma3-1b-it-1n8g-fsdp2tp1 - num_val_samples_to_print: 0 wandb_enabled: true tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: grpo-gemma3-1b-it-1n8g-fsdp2tp1 - tensorboard: {} - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml index 4c3351970c..c50ea4834b 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml @@ -1,137 +1,40 @@ +defaults: ../../grpo_math_1B.yaml grpo: num_prompts_per_step: 64 num_generations_per_prompt: 32 - max_rollout_turns: 1 - max_num_epochs: 1 max_num_steps: 20 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - overlong_filtering: false - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true checkpointing: - enabled: true checkpoint_dir: results/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long - metric_name: val_reward - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null policy: model_name: google/gemma-3-27b-it tokenizer: name: google/gemma-3-27b-it - train_global_batch_size: 512 train_micro_batch_size: 1 - 
diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
index 4c3351970c..c50ea4834b 100644
--- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
+++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml
@@ -1,137 +1,40 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 20
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: google/gemma-3-27b-it
   tokenizer:
     name: google/gemma-3-27b-it
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
-  dynamic_batching:
-    # TODO: OOMs if enabled https://github.com/NVIDIA-NeMo/RL/issues/383
-    enabled: False
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
+      lr: 3.0e-07
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-gemma3-27b-it-8n8g-fsdp2tp8sp-actckpt-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 8
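As in several of the larger recipes below, the training-side tensor parallelism and the vLLM engine's tensor parallelism are configured independently: here training shards 8-way while generation re-shards the same weights 4-way. The two knobs live in different subtrees:

    policy:
      dtensor_cfg:
        tensor_parallel_size: 8    # training (FSDP2/DTensor) sharding
      generation:
        vllm_cfg:
          tensor_parallel_size: 4  # rollout engine sharding, set separately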
diff --git a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
index e1b7c4d809..547b4c4382 100644
--- a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
+++ b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml
@@ -1,152 +1,38 @@
-# GRPO Algorithm Configuration
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 128
   num_generations_per_prompt: 8
-  max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
-  max_num_epochs: 1
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
   max_val_samples: 480
   val_batch_size: 32
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
   reference_policy_kl_penalty: 0.0
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
   sequence_level_importance_ratios: true
   token_level_loss: false
-
 checkpointing:
-  enabled: true
-  checkpoint_dir: "results/grpo"
-  metric_name: "val_reward"
-  higher_is_better: true
   keep_top_k: 10
-  save_period: 10
-  checkpoint_must_save_by: null
-
 policy:
-  model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
+  model_name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
   train_global_batch_size: 64
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 4
   max_total_sequence_length: 8192
-  precision: "bfloat16"
-
   dtensor_cfg:
-    enabled: true
     cpu_offload: true
     sequence_parallel: true
     activation_checkpointing: true
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
-
-  dynamic_batching:
-    enabled: False
-
   sequence_packing:
-    enabled: False
-
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
-  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
-  max_grad_norm: 1.0
-
+    enabled: false
   optimizer:
-    name: "torch.optim.AdamW"
     kwargs:
-      lr: 2.0e-6
-      weight_decay: 0.01
-      betas: [0.9, 0.999]
-      eps: 1e-8
-      # when using Dtensor, we need to set foreach
-      # and fused to False
-      foreach: False
-      fused: False
-
-  scheduler:
-    - name: "torch.optim.lr_scheduler.LinearLR"
-      kwargs:
-        start_factor: 0.1
-        end_factor: 1.0
-        total_iters: 50
-    - name: "torch.optim.lr_scheduler.ConstantLR"
-      kwargs:
-        factor: 1.0
-        total_iters: 10000000000
-    - milestones: [50]
-
+      lr: 2.0e-06
   generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      enforce_eager: True
-    colocated:
-      # true: generation shares training GPUs
-      # false: uses dedicated generation resources
-      enabled: true
-      # only relevant when enabled is false
-      resources:
-        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1
-        num_nodes: null # Decides number of nodes to be dedicated to generation
-
+      enforce_eager: true
 data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "DeepScaler"
-  shuffle: true
-
+  dataset_name: DeepScaler
 env:
   math:
     num_workers: 16
-
 logger:
-  log_dir: "logs" # Base directory for all logs
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
-  tensorboard_enabled: false
-  mlflow_enabled: false
-  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
-  wandb:
-    project: "grpo-dev"
-    name: "grpo-dev-logger"
-  tensorboard: {}
-  mlflow:
-    experiment_name: "grpo-dev"
-    run_name: "grpo-dev-logger"
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+  monitor_gpus: false
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
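The GSPO recipe's substantive deltas are concentrated in `loss_fn`: importance ratios are computed per sequence rather than per token, the token-level loss is turned off, and the reference-policy KL term is zeroed. Collected in one fragment for readability:

    loss_fn:
      reference_policy_kl_penalty: 0.0        # no reference-policy KL penalty
      sequence_level_importance_ratios: true  # one importance ratio per sequence
      token_level_loss: false                 # sequence-level objective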
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
index 81ca15f6bd..a61133c358 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml
@@ -1,169 +1,52 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: True
-  token_level_loss: true
+  use_importance_sampling_correction: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  dtensor_cfg:
-    enabled: False
-
-  dynamic_batching:
-    enabled: False
-
   sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
-
+    enabled: false
   megatron_cfg:
-    enabled: True
+    enabled: true
     empty_unused_memory_level: 1
-    converter_type: "LlamaForCausalLM"
-    tensor_model_parallel_size: 1
+    converter_type: LlamaForCausalLM
     pipeline_model_parallel_size: 2
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    sequence_parallel: False
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    freeze_moe_router: True
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    apply_rope_fusion: True
-    activation_checkpointing: True
-    defer_fp32_logits: True
-
+    activation_checkpointing: true
+    defer_fp32_logits: true
     optimizer:
-      optimizer: "adam"
-      lr: 5.0e-7
-      min_lr: 5.0e-8
+      lr: 5.0e-07
+      min_lr: 5.0e-08
       weight_decay: 0.0
-      bf16: True
-      fp16: False
-      params_dtype: "float32"
-
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      use_distributed_optimizer: True
-      use_precision_aware_optimizer: True
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 2
-      lr_warmup_init: 5.0e-8
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: False
-      overlap_grad_reduce: True
-      overlap_param_gather: True
-      average_in_collective: True
-      use_custom_fsdp: False
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      lr_warmup_init: 5.0e-08
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: 'fp8'
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
+      precision: fp8
       max_model_len: 4096
-      enforce_eager: False
       use_deep_gemm: true
-      num_last_layers_in_bf16: 0
-      num_first_layers_in_bf16: 0
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-1n8g-megatron-fp8
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
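In the fp8 recipe, reduced precision is confined to the vLLM rollout engine; the training path keeps the base precision. The recipe also enables the importance-sampling correction, presumably paired with fp8 rollouts to compensate for the logprob mismatch between the generation engine and the training policy:

    loss_fn:
      use_importance_sampling_correction: true
    policy:
      generation:
        vllm_cfg:
          precision: fp8       # generation only; training precision is inherited
          use_deep_gemm: true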
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
index 17b474bd72..052d082328 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
       async_engine: true
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
     colocated:
       enabled: false
       resources:
-        gpus_per_node: null
        num_nodes: 1
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 2
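This is the one recipe in the set that disables colocated generation, so vLLM gets a dedicated node instead of sharing the training GPUs; per the base config's comments, `colocated.resources` is only consulted in that mode. The relevant fragment:

    policy:
      generation:
        vllm_cfg:
          async_engine: true  # async engine, used here with non-colocated serving
        colocated:
          enabled: false      # generation runs on its own resources
          resources:
            num_nodes: 1      # 1 of the cluster's 2 nodes reserved for generation
    cluster:
      num_nodes: 2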
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
index 1c2b3840ca..df9181f660 100644
--- a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml
@@ -1,137 +1,52 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.1-8B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
index eddf09bf97..fce039a321 100644
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -1,137 +1,31 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  optimizer:
-    name: torch.optim.AdamW
-    kwargs:
-      lr: 5e-06
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
index 4ad29901fa..48f00c626e 100755
--- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
@@ -1,167 +1,36 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 500
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
   model_name: meta-llama/Llama-3.2-1B-Instruct
   tokenizer:
     name: meta-llama/Llama-3.2-1B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
   optimizer: null
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
   dtensor_cfg:
     enabled: false
-  dynamic_batching:
-    enabled: False
-  sequence_packing:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 128009
-    stop_strings: null
+      - 128009
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-llama3.2-1b-instruct-1n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
index 507b1eefd8..e2c2582194 100644
--- a/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
+++ b/examples/configs/recipes/llm/grpo-math-qwen3-30ba3b-megatron-tp4-32k.yaml
@@ -1,175 +1,53 @@
+defaults: ../../grpo_math_1B.yaml
 checkpointing:
-  enabled: True
   checkpoint_dir: results/grpo-math-qwen3-30ba3b-megatron-tp4-32k
   save_period: 3
   keep_top_k: 1
-  metric_name: val_reward
-  higher_is_better: True
-  checkpoint_must_save_by: null
-
 grpo:
-  normalize_rewards: True
-  use_leave_one_out_baseline: True
-  max_num_epochs: 1
   max_num_steps: 3
   num_prompts_per_step: 64
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
   val_period: 3
-  val_at_start: False
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: False
-  use_importance_sampling_correction: False
-  token_level_loss: True
-  ratio_clip_c: null
-
 policy:
-  model_name: "Qwen/Qwen3-30B-A3B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
   logprob_batch_size: 1
   max_total_sequence_length: 32768
-  precision: "bfloat16"
   logprob_chunk_size: 2048
-
   dtensor_cfg:
-    enabled: False
-
-  dynamic_batching:
-    enabled: False
-
+    enabled: false
   sequence_packing:
-    enabled: False
-
-  max_grad_norm: 1.0
+    enabled: false
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
-
-  optimizer: null # remove default FSDP optimizer
-
-  scheduler: null # remove default FSDP scheduler
-
+  optimizer: null
+  scheduler: null
   megatron_cfg:
-    enabled: True
+    enabled: true
     empty_unused_memory_level: 1
-    converter_type: "LlamaForCausalLM"
+    converter_type: LlamaForCausalLM
     tensor_model_parallel_size: 4
-    pipeline_model_parallel_size: 1
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 8
-    sequence_parallel: True
-    pipeline_dtype: ${policy.precision}
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    freeze_moe_router: True
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    apply_rope_fusion: True
-    activation_checkpointing: True
-    defer_fp32_logits: True
-
+    sequence_parallel: true
+    activation_checkpointing: true
+    defer_fp32_logits: true
    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-7
-      min_lr: 5.0e-8
+      lr: 5.0e-07
+      min_lr: 5.0e-08
       weight_decay: 0.0
-      bf16: True
-      fp16: False
-      params_dtype: "float32"
-
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      use_distributed_optimizer: True
-      use_precision_aware_optimizer: True
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 2
-      lr_warmup_init: 5.0e-8
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: False
-      overlap_grad_reduce: True
-      overlap_param_gather: True
-      average_in_collective: True
-      use_custom_fsdp: False
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      lr_warmup_init: 5.0e-08
  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
-      async_engine: False
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-      # NB(pjin): https://github.com/NVIDIA-NeMo/RL/pull/857
-      enforce_eager: True
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
-
-data:
-  dataset_name: "OpenMathInstruct-2"
-  shuffle: true
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-
-env:
-  math:
-    num_workers: 8
-
+      enforce_eager: true
 logger:
   log_dir: logs/grpo-math-qwen3-30ba3b-megatron-tp4-32k
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: True
-  tensorboard_enabled: True
-  mlflow_enabled: False # Disable MLflow logging
-  monitor_gpus: False # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  wandb_enabled: true
+  tensorboard_enabled: true
+  monitor_gpus: false
   wandb:
     project: nemo-rl
-    name: "grpo-math-qwen3-30ba3b-megatron-tp4-32k"
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+  name: grpo-math-qwen3-30ba3b-megatron-tp4-32k
 cluster:
   gpus_per_node: 8
   num_nodes: 4
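The Qwen3-30B-A3B MoE recipes shard along two axes at once: dense/attention weights are split by tensor parallelism, while expert weights are additionally split by expert parallelism (8-way in the 32k recipe above, 4-way in the 8n8g recipe further below). The corresponding knobs, as they appear in these configs:

    policy:
      megatron_cfg:
        tensor_model_parallel_size: 4  # dense/attention sharding
        expert_model_parallel_size: 8  # expert sharding (MoE layers only)
        sequence_parallel: true        # enabled alongside TP in these recipes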
diff --git a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
index a0784ba746..e1e38fbbfc 100644
--- a/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron.yaml
@@ -1,174 +1,40 @@
-# GRPO Algorithm Configuration
-defaults: "../../grpo_math_1B.yaml"
-
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_num_epochs: 1
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
   val_period: -1
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
 loss_fn:
   reference_policy_kl_penalty: 0.04
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
-  ratio_clip_c: null
-
 checkpointing:
   enabled: false
-  checkpoint_dir: "results/grpo_megatron"
-  metric_name: "val_reward"
-  higher_is_better: true
-  keep_top_k: 3
+  checkpoint_dir: results/grpo_megatron
   save_period: 10000
-
 policy:
-  model_name: "moonshotai/Moonlight-16B-A3B-Instruct"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: moonshotai/Moonlight-16B-A3B-Instruct
   train_micro_batch_size: 1
-  generation_batch_size: 64 # Only used when generating using megatron backend
+  generation_batch_size: 64
   logprob_batch_size: 1
   max_total_sequence_length: 8192
-  precision: "bfloat16"
-
   dtensor_cfg:
     enabled: false
-
-  # dynamic_batching improves performance by ensuring logprob and training microbatches
-  # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length
-  # responses are sorted by sequence length and bucketed into microbatches with a total
-  # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the
-  # training and logprob stages respectively.
-  dynamic_batching:
-    enabled: False
-
  sequence_packing:
-    enabled: False # coming soon
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_ffd"
-    sequence_length_round: 64
-
-  max_grad_norm: 1.0
-  # makes the training sequence length divisible by the tensor parallel size
-  # this is useful for sequence parallel training
+    enabled: false
+    algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
-
-  optimizer: null # remove default FSDP optimizer
-
+  optimizer: null
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
-    tensor_model_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 4
     pipeline_model_parallel_size: 4
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    # Causes logprob error divergence for moonlight
-    apply_rope_fusion: False
-
+    apply_rope_fusion: false
    optimizer:
-      optimizer: "adam"
-      lr: 1.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
+      lr: 1.0e-06
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
-  generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    vllm_cfg:
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.6
-      max_model_len: ${policy.max_total_sequence_length}
-
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "OpenMathInstruct-2"
-
-env:
-  math:
-    num_workers: 8
-
 logger:
-  log_dir: "logs" # Base directory for all logs
-  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
-  tensorboard_enabled: false
-  mlflow_enabled: False
-  monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  monitor_gpus: false
   wandb:
-    project: "grpo-dev"
-    name: "grpo-moonlight-16B-A3B-Instruct"
-    tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
-    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
-
+    name: grpo-moonlight-16B-A3B-Instruct
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
index 7fd4007279..b5aaf22ceb 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long.v3.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 20
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-32B
   tokenizer:
     name: Qwen/Qwen2.5-32B
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151643
-    stop_strings: null
+      - 151643
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt-long
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 32
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
index f163092404..44c2f7f8eb 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt.v3.yaml
@@ -1,137 +1,57 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 2
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-32B
   tokenizer:
     name: Qwen/Qwen2.5-32B
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 16384
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
     activation_checkpointing: true
     tensor_parallel_size: 8
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 8
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 16384
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151643
-    stop_strings: null
+      - 151643
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 16384
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 16384
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-32b-32n8g-fsdp2tp8sp-actckpt
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 32
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
index f6ecc1e390..98e7eadedd 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml
@@ -1,137 +1,56 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 30
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-7B-Instruct
   tokenizer:
     name: Qwen/Qwen2.5-7B-Instruct
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   dtensor_cfg:
-    enabled: true
-    cpu_offload: false
     sequence_parallel: true
-    activation_checkpointing: false
     tensor_parallel_size: 4
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 4
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
index 1209040cda..a42ea746a7 100755
--- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-megatron.yaml
@@ -1,189 +1,56 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 30
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  overlong_filtering: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-7B-Instruct
-  tokenizer:
-    name: ${policy.model_name}
-  train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
   logprob_batch_size: 2
   max_total_sequence_length: 4096
-  precision: bfloat16
   dtensor_cfg:
     enabled: false
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 0
-    activation_checkpointing: false
-    converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: 2
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
-    pipeline_model_parallel_size: 1
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
-    sequence_parallel: false
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
-    optimizer:
-      optimizer: "adam"
-      lr: 5.0e-6
-      min_lr: 5.0e-7
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-
-      #sgd
-      sgd_momentum: 0.9
-
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
-      clip_grad: ${policy.max_grad_norm}
-
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 5.0e-7
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-  dynamic_batching:
-    enabled: false
-  sequence_packing:
-    enabled: true
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 4
-  max_grad_norm: 1
   optimizer:
-    name: torch.optim.AdamW
     kwargs:
-      lr: 3e-07
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
+      lr: 3.0e-07
   scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 13
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 13
+    - name: torch.optim.lr_scheduler.LinearLR
+      kwargs:
+        start_factor: 0.1
+        end_factor: 1
+        total_iters: 13
+    - name: torch.optim.lr_scheduler.ConstantLR
+      kwargs:
+        factor: 1
+        total_iters: 10000000000
+    - milestones:
+        - 13
   generation:
-    backend: vllm
     max_new_tokens: 4096
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
       tensor_parallel_size: 4
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 4096
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 4096
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-7b-instruct-4n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 4
diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
index b8f79eb6ae..c417c00dbd 100644
--- a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml
@@ -1,137 +1,31 @@
+defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 32
-  num_generations_per_prompt: 16
-  max_rollout_turns: 1
-  max_num_epochs: 1
   max_num_steps: 450
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-  seed: 42
-  overlong_filtering: false
-  async_grpo:
-    enabled: false
-    max_trajectory_age_steps: 1
-
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  ratio_clip_c: null
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
 checkpointing:
-  enabled: true
   checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
   model_name: Qwen/Qwen2.5-Math-1.5B-Instruct
   tokenizer:
     name: Qwen/Qwen2.5-Math-1.5B-Instruct
-  train_global_batch_size: 512
-  train_micro_batch_size: 4
-  generation_batch_size: 32
-  logprob_batch_size: 4
-  max_total_sequence_length: 512
-  precision: bfloat16
-  dtensor_cfg:
-    enabled: true
-    cpu_offload: false
-    sequence_parallel: false
-    activation_checkpointing: false
-    tensor_parallel_size: 1
-    context_parallel_size: 1
-    custom_parallel_plan: null
   dynamic_batching:
-    enabled: True
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    sequence_length_round: 64
+    enabled: true
   sequence_packing:
     enabled: false
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_first_fit_decreasing"
-    sequence_length_round: 64
   make_sequence_length_divisible_by: 1
-  max_grad_norm: 1
-  optimizer:
-    name: torch.optim.AdamW
-    kwargs:
-      lr: 5e-06
-      weight_decay: 0.01
-      betas:
-      - 0.9
-      - 0.999
-      eps: 1e-08
-      foreach: false
-      fused: false
-  scheduler:
-  - name: torch.optim.lr_scheduler.LinearLR
-    kwargs:
-      start_factor: 0.1
-      end_factor: 1
-      total_iters: 50
-  - name: torch.optim.lr_scheduler.ConstantLR
-    kwargs:
-      factor: 1
-      total_iters: 10000000000
-  - milestones:
-    - 50
   generation:
-    backend: vllm
     max_new_tokens: 512
-    temperature: 1
-    top_p: 1
-    top_k: null
     stop_token_ids:
-    - 151645
-    stop_strings: null
+      - 151645
     vllm_cfg:
-      async_engine: false
-      precision: ${policy.precision}
-      tensor_parallel_size: 1
-      pipeline_parallel_size: 1
-      expert_parallel_size: 1
-      gpu_memory_utilization: 0.6
       max_model_len: 512
-      enforce_eager: False
-    colocated:
-      enabled: true
-      resources:
-        gpus_per_node: null
-        num_nodes: null
 data:
   max_input_seq_length: 512
-  prompt_file: examples/prompts/cot.txt
-  system_prompt_file: null
-  dataset_name: OpenMathInstruct-2
-  shuffle: true
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: false
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
-  num_nodes: 1
diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
index 5c7d1ed78f..de30fe287a 100755
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
@@ -1,155 +1,48 @@
-# GRPO Algorithm Configuration
-defaults: "../../grpo_math_1B.yaml"
-
+defaults: ../../grpo_math_1B.yaml
 grpo:
   num_prompts_per_step: 64
   num_generations_per_prompt: 32
-  max_num_steps: 1000000
-  normalize_rewards: true
-  use_leave_one_out_baseline: true
-  val_period: 10
-  val_at_start: false
-  max_val_samples: 256
-  val_batch_size: 256
-loss_fn:
-  reference_policy_kl_penalty: 0.01
-  ratio_clip_min: 0.2
-  ratio_clip_max: 0.2
-  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
-  use_on_policy_kl_approximation: false
-  use_importance_sampling_correction: false
-  token_level_loss: true
-  ratio_clip_c: null
 checkpointing:
   enabled: false
   checkpoint_dir: results/grpo-qwen3-30ba3b-8n8g-megatron
-  metric_name: val_reward
-  higher_is_better: true
-  keep_top_k: 3
-  save_period: 10
-  checkpoint_must_save_by: null
 policy:
-  model_name: "Qwen/Qwen3-30B-A3B"
-  tokenizer:
-    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
-  train_global_batch_size: 512
+  model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
-  generation_batch_size: 32 # Only used when generating using HF backend
-  logprob_batch_size: 4
   max_total_sequence_length: 4096
-  precision: "bfloat16"
-
   dtensor_cfg:
     enabled: false
-
-  optimizer: null # remove default FSDP optimizer
-
-  scheduler: null # remove default FSDP scheduler
-
-  dynamic_batching:
-    enabled: False
+  optimizer: null
+  scheduler: null
   sequence_packing:
-    enabled: False # coming soon
-    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
-    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
-    algorithm: "modified_ffd"
-    sequence_length_round: 64
-  max_grad_norm: 1.0
+    enabled: false
+    algorithm: modified_ffd
   make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size}
   megatron_cfg:
     enabled: true
     empty_unused_memory_level: 1
-    activation_checkpointing: false
     tensor_model_parallel_size: 4
     pipeline_model_parallel_size: 4
-    context_parallel_size: 1
-    expert_tensor_parallel_size: 1
     expert_model_parallel_size: 4
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    sequence_parallel: True
-    pipeline_dtype: ${policy.precision}
-    freeze_moe_router: true
-    moe_router_dtype: "fp64"
-    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
-    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
-    moe_permute_fusion: false
-    #gives ~20% training perf speedup with sequence packing
-    apply_rope_fusion: True
-
+    sequence_parallel: true
    optimizer:
-      optimizer: "adam"
-      lr: 3.0e-7
-      min_lr: 3.0e-8
-      weight_decay: 0.01
-      bf16: true
-      fp16: false
-      params_dtype: "float32"
-      clip_grad: ${policy.max_grad_norm}
-      #adam
-      adam_beta1: 0.9
-      adam_beta2: 0.999
-      adam_eps: 1e-8
-      #sgd
-      sgd_momentum: 0.9
-      #distributed optimizer
-      use_distributed_optimizer: true
-      use_precision_aware_optimizer: true
-
+      lr: 3.0e-07
+      min_lr: 3.0e-08
     scheduler:
-      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
-      weight_decay_incr_style: "constant"
-      lr_decay_style: "constant"
-      lr_decay_iters: 1000
       lr_warmup_iters: 50
-      lr_warmup_init: 3.0e-8
-
+      lr_warmup_init: 3.0e-08
     env_vars:
-      PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
-
-    distributed_data_parallel_config:
-      grad_reduce_in_fp32: false
-      overlap_grad_reduce: true
-      overlap_param_gather: true
-      average_in_collective: true
-      use_custom_fsdp: false
-      data_parallel_sharding_strategy: "optim_grads_params"
-
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:False
   generation:
-    backend: "vllm"
-    max_new_tokens: ${policy.max_total_sequence_length}
-    temperature: 1.0
-    top_p: 1.0
-    top_k: null
-    stop_token_ids: null
-    stop_strings: null
     vllm_cfg:
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.7
-      max_model_len: ${policy.max_total_sequence_length}
-data:
-  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
-  prompt_file: "examples/prompts/cot.txt"
-  system_prompt_file: null
-  dataset_name: "OpenMathInstruct-2"
-env:
-  math:
-    num_workers: 8
 logger:
   log_dir: logs/grpo-qwen3-30ba3b-8n8g-megatron
-  num_val_samples_to_print: 0
   wandb_enabled: true
   tensorboard_enabled: true
-  mlflow_enabled: False
-  monitor_gpus: true
   wandb:
     project: nemo-rl
     name: grpo-qwen3-30ba3b-8n8g-megatron
-  tensorboard: {}
-  gpu_monitoring:
-    collection_interval: 10
-    flush_interval: 10
 cluster:
   gpus_per_node: 8
   num_nodes: 8
diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
index 33435fbd15..5ffc78b136 100644
--- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
+++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml
@@ -1,134 +1,60 @@
+defaults: ../../sft.yaml
 sft:
-  max_num_epochs: 1
   max_num_steps: 1000000
   val_period: 500
   val_batches: 4
   val_global_batch_size: 128
-  val_micro_batch_size: 1
   val_at_start: false
-  seed: 42
 checkpointing:
-  enabled: true
   checkpoint_dir: results/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron
-  metric_name: val_loss
-  higher_is_better: false
-  keep_top_k: 3
   save_period: 100
-  checkpoint_must_save_by: null
 policy:
-  model_name: "meta-llama/Llama-3.1-70B"
+  model_name: meta-llama/Llama-3.1-70B
   tokenizer:
-    name: meta-llama/Llama-3.1-8B-Instruct ## specify if you'd like to use a tokenizer different from the model's default
+    name: meta-llama/Llama-3.1-8B-Instruct
   train_global_batch_size: 512
-  train_micro_batch_size: 1
   max_total_sequence_length: 4096
-  precision: "bfloat16"
   dtensor_cfg:
     enabled: false
   megatron_cfg:
     enabled: true
-    empty_unused_memory_level: 1
-    activation_checkpointing: false
     tensor_model_parallel_size: 4
-    expert_tensor_parallel_size: 1
-    expert_model_parallel_size: 1
     pipeline_model_parallel_size: 2
-    num_layers_in_first_pipeline_stage: null
-    num_layers_in_last_pipeline_stage: null
-    context_parallel_size: 1
-    pipeline_dtype: ${policy.precision}
- sequence_parallel: false freeze_moe_router: true - moe_router_dtype: "fp64" - moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo - moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - + moe_router_dtype: fp64 + moe_router_load_balancing_type: none + moe_router_bias_update_rate: 0.0 optimizer: - optimizer: "adam" - lr: 2e-5 - min_lr: 2e-5 + lr: 2.0e-05 + min_lr: 2.0e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 adam_beta2: 0.999 - adam_eps: 1e-8 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - + adam_eps: 1.0e-08 clip_grad: 0.0 - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 lr_warmup_iters: 1 - lr_warmup_init: 2e-5 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - use_custom_fsdp: false - data_parallel_sharding_strategy: "optim_grads_params" - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training + lr_warmup_init: 2.0e-05 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} max_grad_norm: null optimizer: - name: "torch.optim.AdamW" kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: [0.9, 0.98] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: - log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running - tensorboard_enabled: true - mlflow_enabled: False - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard + monitor_gpus: false wandb: - project: "sft-dev" - name: "openmathinstruct-nemorl-1M_train" + name: openmathinstruct-nemorl-1M_train tensorboard: - log_dir: "tb_logs-openmathinstruct-nemorl-1M_train" - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) + log_dir: tb_logs-openmathinstruct-nemorl-1M_train cluster: gpus_per_node: 8 num_nodes: 8 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index d7906b82e0..1a7e4e1994 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -1,20 +1,14 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 10000 val_period: 500 
val_batches: 4 val_global_batch_size: 128 val_micro_batch_size: 2 val_at_start: false - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 50 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,59 +16,29 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false tensor_parallel_size: 4 - context_parallel_size: 1 - custom_parallel_plan: null dynamic_batching: enabled: true - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - sequence_length_round: 64 - sequence_packing: - enabled: false make_sequence_length_divisible_by: 1 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' + output_key: generated_solution seed: 42 - shuffle: true logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long tensorboard: log_dir: tb_logs-sft-dev-squad - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index 1fc0ccec7c..dc4a671fec 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -1,20 +1,14 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 10000 val_period: 500 val_batches: 4 val_global_batch_size: 128 val_micro_batch_size: 2 val_at_start: false - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,63 +16,25 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - sequence_length_round: 64 - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - 
eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long tensorboard: log_dir: tb_logs-sft-dev-squad - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 - diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index 8c3f14b531..f4c0296977 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -1,20 +1,10 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 350 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 20 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,60 +12,28 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false sequence_parallel: true - activation_checkpointing: false tensor_parallel_size: 2 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 2 - max_grad_norm: 1 optimizer: - name: torch.optim.AdamW kwargs: - lr: 2e-5 + lr: 2.0e-05 weight_decay: 0.01 - betas: - - 0.9 - - 0.98 - eps: 1e-08 - foreach: false - fused: false + eps: 1.0e-08 data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 0bb610fff3..43b351cd34 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -1,20 +1,10 @@ +defaults: 
../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 250 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 50 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,105 +12,36 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: enabled: false - dynamic_batching: - enabled: false sequence_packing: enabled: true - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1 optimizer: null megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - optimizer: - optimizer: "adam" - lr: 2.0e-5 - min_lr: 1.99999e-5 + lr: 2.0e-05 + min_lr: 1.99999e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 lr_warmup_init: 1.9999e-65 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - - data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' + output_key: generated_solution seed: 42 - shuffle: true logger: log_dir: logs/sft-llama3.1-8b-1n8g-megatron - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index 648f45ab12..e68a1e9792 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ 
b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -1,20 +1,10 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 250 val_period: 500 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-megatron - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: meta-llama/Llama-3.1-8B tokenizer: @@ -22,105 +12,34 @@ policy: train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 - precision: bfloat16 dtensor_cfg: enabled: false - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: ${policy.megatron_cfg.tensor_model_parallel_size} - max_grad_norm: 1 optimizer: null megatron_cfg: enabled: true - empty_unused_memory_level: 1 - activation_checkpointing: false tensor_model_parallel_size: 2 - expert_tensor_parallel_size: 1 - expert_model_parallel_size: 1 pipeline_model_parallel_size: 2 - context_parallel_size: 1 - pipeline_dtype: ${policy.precision} - num_layers_in_first_pipeline_stage: null - num_layers_in_last_pipeline_stage: null - sequence_parallel: false - freeze_moe_router: false - moe_router_dtype: null - moe_router_load_balancing_type: "aux_loss" - moe_router_bias_update_rate: 1e-3 - moe_permute_fusion: false - #gives ~20% training perf speedup with sequence packing - apply_rope_fusion: True - optimizer: - optimizer: "adam" - lr: 2.0e-5 - min_lr: 1.99999e-5 + lr: 2.0e-05 + min_lr: 1.99999e-05 weight_decay: 0.01 bf16: true - fp16: false - params_dtype: "float32" - - #adam - adam_beta1: 0.9 - adam_beta2: 0.98 - adam_eps: 1e-5 - - #sgd - sgd_momentum: 0.9 - - #distributed optimizer - use_distributed_optimizer: true - use_precision_aware_optimizer: true - - clip_grad: ${policy.max_grad_norm} - scheduler: - start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay} - weight_decay_incr_style: "constant" - lr_decay_style: "constant" - lr_decay_iters: 1000 - lr_warmup_iters: 50 lr_warmup_init: 1.9999e-65 - - distributed_data_parallel_config: - grad_reduce_in_fp32: false - overlap_grad_reduce: true - overlap_param_gather: true - average_in_collective: true - data_parallel_sharding_strategy: "optim_grads_params" - - data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.1-8b-1n8g-megatron - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.1-8b-1n8g-megatron tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml index 165e2fa9a3..77ff8aac89 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.v3.yaml @@ -1,82 +1,26 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 500 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1 - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: - model_name: meta-llama/Llama-3.2-1B tokenizer: name: meta-llama/Llama-3.2-1B - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 - max_total_sequence_length: 1024 - precision: bfloat16 - dtensor_cfg: - enabled: true - cpu_offload: false - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 1 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.1 - betas: - - 0.9 - - 0.98 - eps: 1e-05 - foreach: false - fused: false data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution seed: 42 logger: log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-llama3.2-1b-1n8g-fsdp2tp1 tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 - num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml index 800d94711e..c94683c61f 100644 --- a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -1,81 +1,32 @@ +defaults: ../../sft.yaml sft: - max_num_epochs: 1 max_num_steps: 20 - val_period: 10 - val_batches: 8 - val_global_batch_size: 32 - val_micro_batch_size: 1 - val_at_start: true - seed: 42 checkpointing: - enabled: true checkpoint_dir: results/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt - metric_name: val_loss - higher_is_better: false - keep_top_k: 3 save_period: 100 - checkpoint_must_save_by: null policy: model_name: Qwen/Qwen2.5-32B tokenizer: name: Qwen/Qwen2.5-32B - chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + 
message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' - train_global_batch_size: 32 - train_micro_batch_size: 1 max_total_sequence_length: 16000 - precision: bfloat16 dtensor_cfg: - enabled: true - cpu_offload: false sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 8 - context_parallel_size: 1 - custom_parallel_plan: null - dynamic_batching: - enabled: false - sequence_packing: - enabled: false - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - algorithm: "modified_first_fit_decreasing" - sequence_length_round: 64 make_sequence_length_divisible_by: 8 - max_grad_norm: 1 - optimizer: - name: torch.optim.AdamW - kwargs: - lr: 5e-06 - weight_decay: 0.1 - betas: - - 0.9 - - 0.98 - eps: 1e-05 - foreach: false - fused: false data: - max_input_seq_length: ${policy.max_total_sequence_length} - dataset_name: "openmathinstruct2" + dataset_name: openmathinstruct2 prompt_file: examples/prompts/math.txt - split: "train_1M" - add_bos: true - add_eos: true + split: train_1M add_generation_prompt: true - output_key: 'generated_solution' - shuffle: true + output_key: generated_solution logger: log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt - wandb_enabled: true - tensorboard_enabled: true - mlflow_enabled: false - monitor_gpus: true wandb: project: nemo-rl name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt tensorboard: log_dir: tb_logs-sft-dev-openmathinstruct2 - gpu_monitoring: - collection_interval: 10 - flush_interval: 10 cluster: gpus_per_node: 8 num_nodes: 4 diff --git a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml index 3f744e1a30..2d39d9cd7f 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-qwen2.5-vl-3b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml @@ -1,180 +1,14 @@ -# GRPO Algorithm Configuration -grpo: - num_prompts_per_step: 8 - num_generations_per_prompt: 16 - max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - overlong_filtering: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - +defaults: ../../vlm_grpo_3B.yaml checkpointing: - enabled: true - checkpoint_dir: "results/clevr_grpo" - metric_name: "val_reward" - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - + checkpoint_dir: results/clevr_grpo policy: - model_name: "Qwen/Qwen2.5-VL-3B-Instruct" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - train_global_batch_size: 128 - train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 max_total_sequence_length: 3072 - precision: "bfloat16" - - dtensor_cfg: - _v2: true - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - sequence_packing: - enabled: False - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-7 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - - generation: - backend: "vllm" - # max_new_tokens: ${policy.max_total_sequence_length} - max_new_tokens: 1024 - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
- precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - -data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/clevr_cogent_cot.txt" - system_prompt_file: null - dataset_name: "clevr-cogent" - split: "trainA" - shuffle: true - env: - clevr-cogent: - num_workers: 8 - reward_functions: - - name: format - weight: 0.2 - - name: exact_alnum - weight: 0.8 - geometry3k: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: math_expr - weight: 0.9 refcoco: - num_workers: 8 reward_functions: - - name: format - weight: 0.1 - - name: bbox_giou - weight: 0.9 - kwargs: - giou_penalty_thres: 1.0 # (apply giou penalty if iou < giou_penalty_thres; anything less than 0 means use iou only (since the condition iou < 0 is not possible)) - -logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: true - mlflow_enabled: false # Disable MLflow logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 2 - num_nodes: 1 + - name: format + weight: 0.1 + - name: bbox_giou + weight: 0.9 + kwargs: + giou_penalty_thres: 1.0 diff --git a/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml b/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml index 66feabee46..15ef079582 100644 --- a/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml +++ b/examples/configs/recipes/vlm/vlm_grpo-smolvlm2-2.2b-instruct-clevr-1n2g-dtensor2tp1.v1.yaml @@ -1,179 +1,6 @@ -# GRPO Algorithm Configuration -grpo: - num_prompts_per_step: 8 - num_generations_per_prompt: 16 - max_rollout_turns: 1 # for multi-turn rollouts. 
Math Environments just have 1 turn (answering the question) - max_num_epochs: 1 - max_num_steps: 1000000 - normalize_rewards: true - use_leave_one_out_baseline: true - val_period: 10 - val_at_start: false - overlong_filtering: false - max_val_samples: 256 - val_batch_size: 256 - seed: 42 - async_grpo: - enabled: false - max_trajectory_age_steps: 1 - -loss_fn: - reference_policy_kl_penalty: 0.01 - ratio_clip_min: 0.2 - ratio_clip_max: 0.2 - ratio_clip_c: null - # (default off) loss formulation improvements (docs/guides/grpo.md#loss) - use_on_policy_kl_approximation: false - use_importance_sampling_correction: false - token_level_loss: true - +defaults: ../../vlm_grpo_3B.yaml checkpointing: - enabled: true - checkpoint_dir: "results/clevr_grpo" - metric_name: "val_reward" - higher_is_better: true - keep_top_k: 3 - save_period: 10 - checkpoint_must_save_by: null - + checkpoint_dir: results/clevr_grpo policy: - model_name: "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - tokenizer: - name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default - train_global_batch_size: 128 - train_micro_batch_size: 1 - generation_batch_size: 32 # Only used when generating using HF backend - logprob_batch_size: 4 + model_name: HuggingFaceTB/SmolVLM2-2.2B-Instruct max_total_sequence_length: 3072 - precision: "bfloat16" - - dtensor_cfg: - enabled: true - cpu_offload: False - sequence_parallel: false - activation_checkpointing: false - tensor_parallel_size: 1 - context_parallel_size: 1 - custom_parallel_plan: null - - # dynamic_batching improves performance by ensuring logprob and training microbatches - # have a sufficent number of tokens to maximize GPU utilization. Specifically, variable length - # responses are sorted by sequence length and bucketed into microbatches with a total - # amount of tokens is approximately close to 'train_mb_tokens' and 'logprob_mb_tokens' for the - # training and logprob stages respectively. - dynamic_batching: - enabled: True - train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} - logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} - sequence_length_round: 64 - - # makes the training sequence length divisible by the tensor parallel size - # this is useful for sequence parallel training - make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} - max_grad_norm: 1.0 - - sequence_packing: - enabled: False - - optimizer: - name: "torch.optim.AdamW" - kwargs: - lr: 5.0e-7 - weight_decay: 0.01 - betas: [0.9, 0.999] - eps: 1e-8 - # when using Dtensor, we need to set foreach - # and fused to False - foreach: False - fused: False - - scheduler: - - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.1 - end_factor: 1.0 - total_iters: 50 - - name: "torch.optim.lr_scheduler.ConstantLR" - kwargs: - factor: 1.0 - total_iters: 10000000000 - - milestones: [50] - - generation: - backend: "vllm" - # max_new_tokens: ${policy.max_total_sequence_length} - max_new_tokens: 1024 - temperature: 1.0 - top_p: 1.0 - top_k: null - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447. 
- precision: ${policy.precision} - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - expert_parallel_size: 1 - gpu_memory_utilization: 0.6 - max_model_len: ${policy.max_total_sequence_length} - enforce_eager: False - colocated: - # true: generation shares training GPUs - # false: uses dedicated generation resources - enabled: true - # only relevant when enabled is false - resources: - gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 - num_nodes: null # Decides number of nodes to be dedicated to generation - -data: - max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len - prompt_file: "examples/prompts/clevr_cogent_cot.txt" - system_prompt_file: null - dataset_name: "clevr-cogent" - split: "trainA" - shuffle: true - -env: - clevr-cogent: - num_workers: 8 - reward_functions: - - name: format - weight: 0.2 - - name: exact_alnum - weight: 0.8 - geometry3k: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: math_expr - weight: 0.9 - refcoco: - num_workers: 8 - reward_functions: - - name: format - weight: 0.1 - - name: bbox_giou - weight: 0.9 - kwargs: - giou_penalty_thres: 0.5 - -logger: - log_dir: "logs" # Base directory for all logs - num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal - wandb_enabled: false - tensorboard_enabled: true - mlflow_enabled: false # Disable MLflow logging - monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard - wandb: - project: "grpo-dev" - name: "grpo-dev-logger" - tensorboard: {} - gpu_monitoring: - collection_interval: 10 # How often to collect GPU usage metrics (in seconds) - flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) - -cluster: - gpus_per_node: 2 - num_nodes: 1 From 6d52f03faa474f7072bd1c65d661882912b69033 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 17:21:04 +0000 Subject: [PATCH 05/15] sft default chat template Signed-off-by: Terry Kong --- .../recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml | 1 + .../recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml | 1 + .../configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml | 1 + .../configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml | 1 + .../recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml | 1 + examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml index 5ffc78b136..37e3bff33c 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-70b-8n8g-tp4pp2-long-megatron.yaml @@ -12,6 +12,7 @@ policy: model_name: meta-llama/Llama-3.1-70B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 max_total_sequence_length: 4096 dtensor_cfg: diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml index 1a7e4e1994..88d446283d 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-dynamicbatch.yaml @@ -13,6 +13,7 @@ policy: 
model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml index dc4a671fec..86db9da5e0 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp1-long.yaml @@ -13,6 +13,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml index f4c0296977..d78e0d421a 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-fsdp2tp2sp.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml index 43b351cd34..5deed14cb4 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron-seqpack.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml index e68a1e9792..daf5cd5393 100644 --- a/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-1n8g-megatron.yaml @@ -9,6 +9,7 @@ policy: model_name: meta-llama/Llama-3.1-8B tokenizer: name: meta-llama/Llama-3.1-8B-Instruct + chat_template: default train_global_batch_size: 512 train_micro_batch_size: 2 max_total_sequence_length: 4096 From 79d221b9c6bd017abd81aa8c04de9bf305ab8a24 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:07:52 +0000 Subject: [PATCH 06/15] v2 select configs Signed-off-by: Terry Kong --- examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml | 5 +++-- examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml | 5 +++-- examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml | 1 + .../llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml | 1 + .../configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml | 1 + 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml index 8fc6eccbdd..ff37a2db01 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-16K.yaml @@ -1,6 +1,6 @@ defaults: -- ../../grpo_math_1B.yaml -- grpo-deepscaler-1.5b-8K.yaml + - ../../grpo_math_1B.yaml + - grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.001 ratio_clip_max: 0.28 @@ -12,3 +12,4 @@ policy: 
sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 + _v2: false diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml index 2bf34c47d1..eec67f1340 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml @@ -1,6 +1,6 @@ defaults: -- ../../grpo_math_1B.yaml -- grpo-deepscaler-1.5b-8K.yaml + - ../../grpo_math_1B.yaml + - grpo-deepscaler-1.5b-8K.yaml loss_fn: reference_policy_kl_penalty: 0.0001 ratio_clip_max: 0.28 @@ -12,6 +12,7 @@ policy: sequence_parallel: true activation_checkpointing: true tensor_parallel_size: 2 + _v2: false sequence_packing: enabled: false optimizer: diff --git a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml index 48d3317e81..46193f04da 100644 --- a/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml @@ -17,6 +17,7 @@ policy: cpu_offload: true sequence_parallel: true activation_checkpointing: true + _v2: false sequence_packing: enabled: false optimizer: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml index c50ea4834b..8b0157d2d3 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-8n8g-fsdp2tp8-actckpt-long.yaml @@ -15,6 +15,7 @@ policy: dtensor_cfg: activation_checkpointing: true tensor_parallel_size: 8 + _v2: false sequence_packing: enabled: false make_sequence_length_divisible_by: 8 diff --git a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml index 547b4c4382..0f410c436a 100644 --- a/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml @@ -19,6 +19,7 @@ policy: cpu_offload: true sequence_parallel: true activation_checkpointing: true + _v2: false sequence_packing: enabled: false optimizer: From a844f62b008ebb3ff2f036e341590e3c38d7b754 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:10:14 +0000 Subject: [PATCH 07/15] copyright Signed-off-by: Terry Kong --- tools/config_cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/config_cli.py b/tools/config_cli.py index 14010d8d43..8656de96db 100755 --- a/tools/config_cli.py +++ b/tools/config_cli.py @@ -4,6 +4,19 @@ # "omegaconf" # ] # /// +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Utilities for working with YAML configs in this repo. 
Subcommands: From 3f3016df439dbe94fba1294233dbf4c093e0fe4a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:36:15 -0700 Subject: [PATCH 08/15] Update examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 78c3e80336..8b3a43ea28 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -28,7 +28,7 @@ logger: tensorboard_enabled: true wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1 + name: dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick cluster: gpus_per_node: 8 num_nodes: 4 From d67058d6e4cfcb71ab0e036836756ef045aa6af0 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:36:38 -0700 Subject: [PATCH 09/15] Update examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml index 72ac01081d..8df4bc3fb0 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.v2.yaml @@ -26,7 +26,7 @@ logger: tensorboard_enabled: true wandb: project: nemo-rl - name: dpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-megatron + name: dpo-llama3.1-8b-instruct-4n8g-megatron-tp4.v2 cluster: gpus_per_node: 8 num_nodes: 4 From fb5ced87ae4c651ffa9d7892c627f2f7e6140919 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 11:38:03 -0700 Subject: [PATCH 10/15] Update examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Terry Kong --- .../configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 15ca65c8f9..bc3b7fcb3d 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -26,4 +26,5 @@ logger: project: nemo-rl name: grpo-gemma3-1b-it-1n8g-fsdp2tp1 cluster: + num_nodes: 1 gpus_per_node: 8 From 661f48ae3203bca580155a9c0b2394e652fc1146 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 24 Sep 2025 18:40:22 +0000 Subject: [PATCH 11/15] revert Signed-off-by: Terry Kong --- .../configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index bc3b7fcb3d..15ca65c8f9 100644 --- 
a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
+++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
@@ -26,5 +26,4 @@ logger:
     project: nemo-rl
     name: grpo-gemma3-1b-it-1n8g-fsdp2tp1
 cluster:
-  num_nodes: 1
   gpus_per_node: 8

From a236134b9a12d839e8a502e2c238ac23a486e913 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 24 Sep 2025 14:33:43 -0700
Subject: [PATCH 12/15] Update tools/config_cli.py

Co-authored-by: Yi-Fu Wu
Signed-off-by: Terry Kong
---
 tools/config_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/config_cli.py b/tools/config_cli.py
index 8656de96db..9defc451eb 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -28,7 +28,7 @@
   - minimize-check: Same args as `minimize` but only checks if minimization
     would change the file; exits non-zero if changes are needed.

-Both commands support printing to stdout or in-place editing of the config file.
+The `expand` and `minimize` commands support printing to stdout or in-place editing of the config file.

 Example:
     # Expand a config with a root level "defaults" key to see the full config; print to stdout

From 8b59d232b88f5630b66aa43652fe4b26e7e7b6d5 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 24 Sep 2025 15:17:48 -0700
Subject: [PATCH 13/15] Update .pre-commit-config.yaml

Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .pre-commit-config.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b09f6cceb3..cc02d93e42 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,14 +47,16 @@ repos:
       additional_dependencies: []
       minimum_pre_commit_version: "2.9.2"

-  # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches
-  # what you want merge in early otherwise you risk running one experiment, but when you merge the config
-  # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters.
-  # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with
-  # the default one, it gets our recommended chat_template which is not what comes from the config.
+  # This pre-commit hook ensures that the config file is minimized and reflects exactly what you
+  # intend to merge. Without it, you might run experiments with one config, but when merging upstream,
+  # the config could silently fall back to the base defaults, resulting in different hyperparameters.
   #
-  # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config
-  # is minimized before accepting the recipe upstream.
+  # For example, we've seen cases where an SFT recipe runs without a custom chat_template. When merged,
+  # it unexpectedly picks up the default recommended chat_template from upstream, which doesn't match
+  # the original experiment setup.
+  #
+  # If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
+  # is accepted upstream, we expect the config to be minimized.
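+  #
+  # As a sketch (hypothetical recipe shown for illustration only; not one of the files changed
+  # here), a minimized config keeps only the keys that differ from its base and points at the
+  # base via a relative `defaults` entry:
+  #
+  #   defaults: ../../sft.yaml
+  #   policy:
+  #     model_name: meta-llama/Llama-3.1-8B
+  #
+  # Every key left out is inherited from the base config when the recipe is loaded.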
From 8b59d232b88f5630b66aa43652fe4b26e7e7b6d5 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 24 Sep 2025 15:17:48 -0700
Subject: [PATCH 13/15] Update .pre-commit-config.yaml

Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .pre-commit-config.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b09f6cceb3..cc02d93e42 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -47,14 +47,16 @@ repos:
       additional_dependencies: []
       minimum_pre_commit_version: "2.9.2"
 
-  # The rationale behind this pre-commit hook is that we want to ensure the config is minimized and matches
-  # what you want merge in early otherwise you risk running one experiment, but when you merge the config
-  # into upstream, you'll merge with the base config and that could be an experiment with different hyperparameters.
-  # Anecdotally, this has been an issue when a SFT recipe runs without a custom chat_template, but when it merges with
-  # the default one, it gets our recommended chat_template which is not what comes from the config.
+  # This pre-commit hook ensures that the config file is minimized and reflects exactly what you
+  # intend to merge. Without it, you might run experiments with one config, but when merging upstream,
+  # the config could silently fall back to the base defaults—resulting in different hyperparameters.
   #
-  # You can disable this pre-commit hook if you find this disruptive, but we will expect that the config
-  # is minimized before accepting the recipe upstream.
+  # For example, we’ve seen cases where an SFT recipe runs without a custom chat_template. When merged,
+  # it unexpectedly picks up the default recommended chat_template from upstream, which doesn’t match
+  # the original experiment setup.
+  #
+  # If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
+  # is accepted upstream, we expect the config to be minimized.
   - repo: local
     hooks:
       - id: configs-minimize-check-llm
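The merge hazard the rewritten comment describes can be reproduced directly with OmegaConf. A self-contained sketch; the `sft`, `chat_template`, and `lr` keys are illustrative, not taken from a real recipe:

```python
# Sketch of the merge hazard motivating minimize-check (keys are made up).
from omegaconf import OmegaConf

base = OmegaConf.create({"sft": {"chat_template": "recommended", "lr": 1e-5}})

# A recipe that was run WITHOUT a chat_template override: the experiment used
# whatever the code falls back to, but the file does not record that choice.
recipe = OmegaConf.create({"sft": {"lr": 2e-5}})

merged = OmegaConf.merge(base, recipe)
# The merged config now carries the base's chat_template, which may not be
# what the original experiment actually ran with.
assert merged.sft.chat_template == "recommended"
print(OmegaConf.to_yaml(merged))
```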
From 044385cfe327626840d710eb51864591652ba031 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 25 Sep 2025 06:50:08 +0000
Subject: [PATCH 14/15] unit tests

Signed-off-by: Terry Kong
---
 tests/unit/tools/test_config_cli.py | 267 ++++++++++++++++++++++++++++
 tools/config_cli.py                 | 126 +++++++++----
 2 files changed, 361 insertions(+), 32 deletions(-)
 create mode 100644 tests/unit/tools/test_config_cli.py

diff --git a/tests/unit/tools/test_config_cli.py b/tests/unit/tools/test_config_cli.py
new file mode 100644
index 0000000000..805b5a2a5a
--- /dev/null
+++ b/tests/unit/tools/test_config_cli.py
@@ -0,0 +1,267 @@
+import importlib.util
+import inspect
+import os
+from pathlib import Path
+from textwrap import dedent
+from typing import Any
+
+import pytest
+from omegaconf import OmegaConf
+
+
+def _load_cli_module() -> Any:
+    # Use a path relative to this test file to import tools/config_cli.py
+    test_file = Path(__file__).resolve()
+    repo_root = test_file.parents[3]
+    cli_path = repo_root / "tools" / "config_cli.py"
+    assert cli_path.exists(), f"Expected CLI at {cli_path}"
+    spec = importlib.util.spec_from_file_location("config_cli", str(cli_path))
+    assert spec and spec.loader
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore[arg-type]
+    return module
+
+
+@pytest.fixture(scope="module")
+def cli() -> Any:
+    return _load_cli_module()
+
+
+def test__resolve_path_absolute_and_relative(cli: Any, tmp_path: Path) -> None:
+    base = tmp_path
+    # absolute input stays absolute
+    abs_in = "/etc/hosts"
+    assert str(cli.resolve_path(base, abs_in)) == abs_in
+    # relative input resolves against base
+    rel_in = "sub/dir/file.yaml"
+    expected = (base / rel_in).resolve()
+    assert cli.resolve_path(base, rel_in) == expected
+
+
+def test__prune_equal_basic(cli: Any) -> None:
+    # Dict pruning: remove keys equal to base, keep differences
+    a = {"a": 1, "b": {"c": 2, "d": 3}}
+    b = {"a": 1, "b": {"c": 9, "d": 3}}
+    out = cli._prune_equal(a, b)
+    assert out == {"b": {"c": 2}}
+
+    # List pruning: equal lists of same length return REMOVE sentinel
+    a_list = [1, 2, 3]
+    b_list = [1, 2, 3]
+    out_list = cli._prune_equal(a_list, b_list)
+    assert out_list is cli.REMOVE
+
+    # Base-type equality returns REMOVE
+    assert cli._prune_equal(5, 5) is cli.REMOVE
+    # Different base-types keep original
+    assert cli._prune_equal(5, 6) == 5
+
+
+def test__ensure_defaults_relative_variants(cli: Any, tmp_path: Path) -> None:
+    base = tmp_path / "configs" / "base.yaml"
+    child = tmp_path / "recipes" / "child.yaml"
+    child.parent.mkdir(parents=True, exist_ok=True)
+    base.parent.mkdir(parents=True, exist_ok=True)
+    base.write_text("base: true\n")
+    child.write_text("child: true\n")
+
+    # Case 1: no defaults in child
+    cfg: dict[str, Any] = {"child": True}
+    cli._ensure_defaults_relative(child, base, cfg)
+    rel = os.path.relpath(str(base), start=str(child.parent))
+    assert cfg["defaults"] == rel
+
+    # Case 2: defaults as string (ensure base inserted first if missing)
+    cfg2: dict[str, Any] = {"defaults": "something.yaml"}
+    cli._ensure_defaults_relative(child, base, cfg2)
+    val = cfg2["defaults"]
+    if isinstance(val, list):
+        assert val[0] == rel
+    else:
+        # collapsed to a string only if single element
+        assert val == rel or val == "something.yaml"
+
+    # Case 3: defaults list, ensure base is present and order preserved otherwise
+    cfg3: dict[str, Any] = {"defaults": ["x.yaml", "y.yaml"]}
+    cli._ensure_defaults_relative(child, base, cfg3)
+    assert isinstance(cfg3["defaults"], list)
+    assert cfg3["defaults"][0] == rel
+
+
+def test_minimize_in_place_and_check(
+    cli: Any, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    base = tmp_path / "base.yaml"
+    child = tmp_path / "child.yaml"
+    base.write_text(
+        dedent(
+            """
+            common:
+              a: 1
+              list: [1, 2]
+              nested:
+                x: 0
+            top_only: 7
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            common:
+              a: 1
+              list: [1, 2]
+              nested:
+                x: 1
+            new_top: 42
+            """
+        ).strip()
+    )
+
+    # Before minimizing, check should fail
+    ns = type("NS", (), {"base": str(base), "config": str(child)})
+    ret = cli.minimize_check(ns)
+    assert ret == 1
+    err = capsys.readouterr().err
+    assert "Suggested fix" in err
+
+    # Minimize in place
+    ns2 = type("NS", (), {"base": str(base), "config": str(child), "in_place": True})
+    ret2 = cli.minimize(ns2)
+    assert ret2 == 0
+    minimized = child.read_text().strip()
+    rel = os.path.relpath(str(base), start=str(child.parent))
+    assert minimized.splitlines()[0].startswith("defaults:")
+    assert rel in minimized
+    # Ensure pruned keys are gone and differences stay
+    assert "top_only" not in minimized
+    assert "new_top" in minimized
+    # Tolerate either 2- or 4-space YAML indentation in the emitted file
+    normalized = minimized.replace("\r\n", "\n")
+    assert "nested:\n  x: 1" in normalized or "nested:\n    x: 1" in normalized
+
+    # After minimizing, check should pass
+    ret3 = cli.minimize_check(ns)
+    assert ret3 == 0
+
+
+def test_expand_and_compare(
+    cli: Any, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    parent = tmp_path / "parent.yaml"
+    child = tmp_path / "child.yaml"
+    parent.write_text(
+        dedent(
+            """
+            base_value: 10
+            block:
+              a: 1
+              b: 2
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base_value: 11
+            block:
+              b: 3
+              c: 4
+            """
+        ).strip()
+    )
+
+    # expand should merge without resolving interpolations; capture stdout
+    ns = type("NS", (), {"config": str(child), "in_place": False})
+    ret = cli.expand(ns)
+    assert ret == 0
+    out = capsys.readouterr().out
+    # Expect merged keys present
+    assert "base_value: 11" in out
+    assert "a: 1" in out and "b: 3" in out and "c: 4" in out
+
+    # compare identical files prints identical message
+    ns_cmp = type("NS", (), {"left": str(child), "right": str(child)})
+    ret_cmp = cli.compare(ns_cmp)
+    assert ret_cmp == 0
+    out_cmp = capsys.readouterr().out
+    assert "Configs are identical" in out_cmp
+
+    # compare different files prints sections: changed
+    alt = tmp_path / "alt.yaml"
+    alt.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base_value: 12
+            block:
+              a: 9
+              b: 3
+              d: 5
+            """
+        ).strip()
+    )
+    ns_cmp2 = type("NS", (), {"left": str(child), "right": str(alt)})
+    ret_cmp2 = cli.compare(ns_cmp2)
+    assert ret_cmp2 == 0
+    out_cmp2 = capsys.readouterr().out
+    assert "Comparing configs" in out_cmp2
+    assert "Added in Right" in out_cmp2
+    assert "Changed (Left -> Right)" in out_cmp2
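+
+
+# The two tests below guard the vendored loader in tools/config_cli.py against
+# drift from nemo_rl.utils.config: the first checks behavioral equivalence on a
+# small parent/child config pair, and the second enforces an exact source-level
+# match via inspect.getsource.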
+def test_vendored_loader_behavior_matches_upstream(tmp_path: Path) -> None:
+    # Prepare simple parent/child config files
+    parent = tmp_path / "parent.yaml"
+    child = tmp_path / "child.yaml"
+    parent.write_text(
+        dedent(
+            """
+            base: 1
+            block:
+              a: 2
+              b: 3
+            """
+        ).strip()
+    )
+    child.write_text(
+        dedent(
+            """
+            defaults: parent.yaml
+            base: 9
+            block:
+              b: 7
+              c: 4
+            """
+        ).strip()
+    )
+
+    # Use text-level expansion comparison by importing both implementations
+    # Vendored
+    cli = _load_cli_module()
+    vendored_cfg = cli.load_config_with_inheritance(str(child))
+    vendored = OmegaConf.to_container(vendored_cfg)
+
+    # Upstream via direct import; if it fails, the test should fail
+    import nemo_rl.utils.config as upstream
+
+    upstream_cfg = upstream.load_config_with_inheritance(str(child))
+    upstream_out = OmegaConf.to_container(upstream_cfg)
+
+    assert vendored == upstream_out
+
+
+def test_vendored_loader_drift_against_upstream_source() -> None:
+    # Enforce exact copy-paste: the vendored function's source must match upstream exactly
+    cli = _load_cli_module()
+    vendored_fn = cli.load_config_with_inheritance
+
+    import nemo_rl.utils.config as upstream
+
+    upstream_fn = upstream.load_config_with_inheritance
+
+    up_src = inspect.getsource(upstream_fn).strip()
+    ven_src = inspect.getsource(vendored_fn).strip()
+    assert up_src == ven_src
diff --git a/tools/config_cli.py b/tools/config_cli.py
index 9defc451eb..38500cb02a 100755
--- a/tools/config_cli.py
+++ b/tools/config_cli.py
@@ -79,7 +79,7 @@
 # VENDORED SECTION: Minimal self-contained config loader (no nemo_rl dependency)
 #
 # Original source: `nemo_rl/utils/config.py`
-# - Functions adapted: `_resolve_path`, `load_config_with_inheritance`, `load_config`
+# - Functions adapted: `resolve_path`, `load_config_with_inheritance`, `load_config`
 # - Purpose: avoid importing from nemo_rl so this script is standalone
 # - If upstream changes, consider updating this vendored block accordingly
 # ============================================================================
@@ -88,58 +88,120 @@
 from omegaconf import DictConfig, ListConfig, OmegaConf
 
 
-def _resolve_path(base_path: Path, path: str) -> Path:
+def resolve_path(base_path: Path, path: str) -> Path:
+    """Resolve a path relative to the base path."""
     if path.startswith("/"):
         return Path(path)
-    return (base_path / path).resolve()
+    return base_path / path
 
 
 def load_config_with_inheritance(
-    config_path: Union[str, Path], base_dir: Optional[Union[str, Path]] = None
+    config_path: Union[str, Path],
+    base_dir: Optional[Union[str, Path]] = None,
 ) -> DictConfig:
-    """Load a YAML config and resolve simple inheritance via a top-level `defaults` key.
+    """Load a config file with inheritance support.
 
-    Supports:
-    - `defaults: parent.yaml` (string)
-    - `defaults: [parent1.yaml, parent2.yaml]` (list)
-    - Nested inheritance via parent files with their own `defaults`.
+    Args:
+        config_path: Path to the config file
+        base_dir: Base directory for resolving relative paths.
+            If None, uses config_path's directory
+
+    Returns:
+        Merged config dictionary
     """
-    config_path = Path(config_path).resolve()
+    config_path = Path(config_path)
     if base_dir is None:
         base_dir = config_path.parent
     base_dir = Path(base_dir)
 
-    cfg = OmegaConf.load(config_path)
-    if not isinstance(cfg, DictConfig):
-        raise TypeError(
-            f"Config at {config_path} must be a mapping (DictConfig), got {type(cfg)}"
-        )
+    config = OmegaConf.load(config_path)
+    assert isinstance(config, DictConfig), (
+        "Config must be a Dictionary Config (List Config not supported)"
+    )
 
-    if "defaults" in cfg:
-        defaults = cfg.pop("defaults")
+    # Handle inheritance
+    if "defaults" in config:
+        defaults = config.pop("defaults")
         if isinstance(defaults, (str, Path)):
-            defaults_list = [str(defaults)]
+            defaults = [defaults]
         elif isinstance(defaults, ListConfig):
-            defaults_list = [str(d) for d in defaults]
-        elif isinstance(defaults, list):
-            defaults_list = [str(d) for d in defaults]
-        else:
-            raise TypeError(
-                f"Unsupported type for defaults: {type(defaults)} in {config_path}"
-            )
+            defaults = [str(d) for d in defaults]
 
-        merged: DictConfig = OmegaConf.create({})  # type: ignore[assignment]
-        for default_entry in defaults_list:
-            parent_path = _resolve_path(base_dir, str(default_entry))
-            parent_cfg = load_config_with_inheritance(parent_path, base_dir)
-            merged = cast(DictConfig, OmegaConf.merge(merged, parent_cfg))
+        # Load and merge all parent configs
+        base_config = OmegaConf.create({})
+        for default in defaults:
+            parent_path = resolve_path(base_dir, str(default))
+            parent_config = load_config_with_inheritance(parent_path, base_dir)
+            base_config = cast(DictConfig, OmegaConf.merge(base_config, parent_config))
 
-        cfg = cast(DictConfig, OmegaConf.merge(merged, cfg))
+        # Merge with current config
+        config = cast(DictConfig, OmegaConf.merge(base_config, config))
 
-    return cfg
+    return config
 
 
 def load_config(config_path: Union[str, Path]) -> DictConfig:
+    """Load a config file with inheritance support and convert it to an OmegaConf object.
+
+    The config inheritance system supports:
+
+    1. Single inheritance:
+    ```yaml
+    # child.yaml
+    defaults: parent.yaml
+    common:
+      value: 43
+    ```
+
+    2. Multiple inheritance:
+    ```yaml
+    # child.yaml
+    defaults:
+      - parent1.yaml
+      - parent2.yaml
+    common:
+      value: 44
+    ```
+
+    3. Nested inheritance:
+    ```yaml
+    # parent.yaml
+    defaults: grandparent.yaml
+    common:
+      value: 43
+
+    # child.yaml
+    defaults: parent.yaml
+    common:
+      value: 44
+    ```
+
+    4. Variable interpolation:
+    ```yaml
+    # parent.yaml
+    base_value: 42
+    derived:
+      value: ${base_value}
+
+    # child.yaml
+    defaults: parent.yaml
+    base_value: 43  # This will update both base_value and derived.value
+    ```
+
+    The system handles:
+    - Relative and absolute paths
+    - Multiple inheritance
+    - Nested inheritance
+    - Variable interpolation
+
+    The inheritance is resolved depth-first, with later configs overriding earlier ones.
+    This means in multiple inheritance, the last config in the list takes precedence.
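+
+    For example, with `defaults: [parent1.yaml, parent2.yaml]`, any key present
+    in both parents takes its value from `parent2.yaml`, and keys set in the
+    child itself override both parents.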
+
+    Args:
+        config_path: Path to the config file
+
+    Returns:
+        Merged config dictionary
+    """
     return load_config_with_inheritance(config_path)

From 20ee149506a50c37b018940831617672abf87dd9 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 25 Sep 2025 06:51:11 +0000
Subject: [PATCH 15/15] copyright

Signed-off-by: Terry Kong
---
 tests/unit/tools/test_config_cli.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tests/unit/tools/test_config_cli.py b/tests/unit/tools/test_config_cli.py
index 805b5a2a5a..63af6c8294 100644
--- a/tests/unit/tools/test_config_cli.py
+++ b/tests/unit/tools/test_config_cli.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import importlib.util
 import inspect
 import os
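As a quick sanity check of the inheritance semantics documented in PATCH 14, here is a small self-contained sketch that exercises the vendored loader the same way the unit tests do. It assumes it is run from the repo root with omegaconf installed; the file names and keys are made up for illustration:

```python
# Illustrative sketch: load tools/config_cli.py by path (as the unit tests do)
# and confirm that the last entry in a defaults list takes precedence.
import importlib.util
import tempfile
from pathlib import Path

spec = importlib.util.spec_from_file_location("config_cli", "tools/config_cli.py")
assert spec and spec.loader
cli = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cli)

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    (root / "parent1.yaml").write_text("lr: 1\nseed: 7\n")
    (root / "parent2.yaml").write_text("lr: 2\n")
    (root / "child.yaml").write_text(
        "defaults:\n  - parent1.yaml\n  - parent2.yaml\nbatch: 8\n"
    )
    cfg = cli.load_config_with_inheritance(str(root / "child.yaml"))
    # parent2 is last in the defaults list, so its lr overrides parent1's;
    # keys unique to a parent (seed) or to the child (batch) survive the merge.
    assert cfg.lr == 2 and cfg.seed == 7 and cfg.batch == 8
```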