From bc48e8465eba0e179997236b6890353548737d16 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 16 Apr 2025 15:28:34 -0700
Subject: [PATCH 01/19] feat: release convergence runs

Signed-off-by: Terry Kong

add qwen

Signed-off-by: Terry Kong

llama3.2 1b recipe

Signed-off-by: Terry Kong

fsdp2 tests

Signed-off-by: Terry Kong

NUM_HOURS

Signed-off-by: Terry Kong

add more tests for sft dtensor

Signed-off-by: Terry Kong

move things around

Signed-off-by: Terry Kong

fix

Signed-off-by: Terry Kong

update

Signed-off-by: Terry Kong

correct the paths

Signed-off-by: Terry Kong

missing copyright

Signed-off-by: Terry Kong

add code snapshot and continuing ability

Signed-off-by: Terry Kong

moving things around

Signed-off-by: Terry Kong

performance directory

Signed-off-by: Terry Kong

misplaced .gitkeep

Signed-off-by: Terry Kong

recursive glob

Signed-off-by: Terry Kong

fix

Signed-off-by: Terry Kong

move to minutes which is better granularity for the perf team

Signed-off-by: Terry Kong

update hermetic

Signed-off-by: Terry Kong

move things around

Signed-off-by: Terry Kong

fix up

Signed-off-by: Terry Kong

readme cleanup

Signed-off-by: Terry Kong

add model names to guard against model names changing from the default configs

Signed-off-by: Terry Kong

cleanup

Signed-off-by: Terry Kong

test cases

Signed-off-by: Terry Kong

missing files

Signed-off-by: Terry Kong

fix all tests

Signed-off-by: Terry Kong

incorporated everyone's feedback

Signed-off-by: Terry Kong

make sure hfhome/cache are propagated

Signed-off-by: Terry Kong

project_root correction

Signed-off-by: Terry Kong

launch MOUNTS typo

Signed-off-by: Terry Kong

docs

Signed-off-by: Terry Kong

copyright

Signed-off-by: Terry Kong

typo

Signed-off-by: Terry Kong

clean up docs

Signed-off-by: Terry Kong

wip

Signed-off-by: Terry Kong

fix the mount

Signed-off-by: Terry Kong

fix all 70b -> 32b tests

Signed-off-by: Terry Kong

get all the test step times down

Signed-off-by: Terry Kong
---
 .gitignore                                    |   2 +
 nemo_rl/__init__.py                           |  13 ++
 recipes/README.md                             |  67 ++++++
 ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh |  65 ++++++
 ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh |  65 ++++++
 ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh |  69 ++++++
 ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh |  69 ++++++
 .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh    |  64 ++++++
 ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh |  67 ++++++
 ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh |  65 ++++++
 recipes/llm/performance/.gitkeep              |   0
 .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh    |  66 ++++++
 ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh |  68 ++++++
 ...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh |  68 ++++++
 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh  |  66 ++++++
 ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh |  70 ++++++
 tests/README.md                               |  20 ++
 tests/{functional => }/check_metrics.py       |   0
 tests/functional/dpo.sh                       |  18 +-
 tests/functional/grpo.sh                      |  19 +-
 tests/functional/sft.sh                       |  21 +-
 tests/{functional => }/json_dump_tb_logs.py   |   0
 tests/test_suites/nightly.txt                 |  28 +++
 tests/test_suites/nightly_performance.txt     |   0
 tests/test_suites/release.txt                 |  16 ++
 tests/test_suites/release_performance.txt     |   0
 tests/unit/test_recipes_and_test_suites.py    | 200 ++++++++++++++++++
 tools/autoformat.sh                           |   0
 tools/code_snapshot.sh                        |  40 ++++
 tools/launch                                  | 175 +++++++++++++++
 tools/package_release_runs.sh                 |  41 ++++
 31 files changed, 1434 insertions(+), 28 deletions(-)
 create mode 100644 recipes/README.md
 create mode 100755 recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
 create mode 100755
recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
 create mode 100644 recipes/llm/performance/.gitkeep
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
 create mode 100755 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
 create mode 100755 recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
 create mode 100644 tests/README.md
 rename tests/{functional => }/check_metrics.py (100%)
 rename tests/{functional => }/json_dump_tb_logs.py (100%)
 create mode 100644 tests/test_suites/nightly.txt
 create mode 100644 tests/test_suites/nightly_performance.txt
 create mode 100644 tests/test_suites/release.txt
 create mode 100644 tests/test_suites/release_performance.txt
 create mode 100644 tests/unit/test_recipes_and_test_suites.py
 mode change 100644 => 100755 tools/autoformat.sh
 create mode 100644 tools/code_snapshot.sh
 create mode 100755 tools/launch
 create mode 100755 tools/package_release_runs.sh

diff --git a/.gitignore b/.gitignore
index 478990ddc8..12121a4155 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@ apidocs/
 dist/
 *.egg-info/
 *.vscode/
+release_run*
+ckpts/
 
 # Test
 coverage.json

diff --git a/nemo_rl/__init__.py b/nemo_rl/__init__.py
index 1606956b87..c755e5ed0f 100644
--- a/nemo_rl/__init__.py
+++ b/nemo_rl/__init__.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from nemo_rl.package_info import (
     __contact_emails__,

diff --git a/recipes/README.md b/recipes/README.md
new file mode 100644
index 0000000000..3ccf0d75c9
--- /dev/null
+++ b/recipes/README.md
@@ -0,0 +1,67 @@
+# Recipes
+
+## Naming
+
+Each test is named:
+```
+<algo>-<model>-#n#g-<parallelism>-<variant>.sh
+```
+where `#n#g` encodes the node and GPU counts (e.g., `4n8g` = 4 nodes x 8 GPUs per node)
+and the trailing `<variant>` (e.g., `long`) is optional.
+
+Examples:
+* sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh
+* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh
+
+## Running manually
+
+Each recipe can be run on the head node:
+
+```sh
+uv run ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+```
+
+and the result directory can be found at the same level as the script (same name, without the `.sh` suffix):
+
+```sh
+ls -lh llm/sft-llama3.2-1b-1n8g-fsdp2tp1/
+# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts
+# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs
+# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json
+# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log
+```
+
+## Launching with code snapshots
+
+We provide a convenience script that creates a code snapshot and launches
+`NUM_RUNS` slurm jobs (`NUM_RUNS` is defined in a config header at the top of
+each recipe, shown below).
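+A representative config header (the values here are copied from the 8B GRPO
+convergence recipe in this repo; every recipe sets its own):
+
+```sh
+# ===== BEGIN CONFIG =====
+NUM_NODES=4          # Nodes used by each slurm job
+STEPS_PER_RUN=100    # Steps one job is expected to complete within its walltime
+MAX_STEPS=500        # Total training steps across all chained jobs
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=240      # Slurm walltime per job, in minutes
+# ===== END CONFIG =====
+```
+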
+We create a code snapshot to ensure that even as the master repo's code
+changes, you can always rerun your experiment against the snapshot of the code
+as it was when the experiment was first launched.
+
+```sh
+# Launch
+CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+
+# Prints estimated GPU hours and then exits
+DRYRUN=1 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+
+# Prints estimated GPU hours, creates the code snapshot, then exits
+DRYRUN=2 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+```
+
+After this completes, you can find the result under
+
+```sh
+ls -lh ../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1/
+# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts
+# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs
+# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json
+# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log
+```
+
+As a convenience, there's also a `continue.sh` script in each snapshot
+directory that will launch another run using the same arguments. This is
+helpful if your job was unexpectedly cancelled or you want to run it for a
+little longer.
+
+```sh
+# This launches one more run of the same experiment
+../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/continue.sh
+```

diff --git a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
new file mode 100755
index 0000000000..3feb431a2f
--- /dev/null
+++ b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -eou pipefail
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=100
+MAX_STEPS=500
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
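+# The BEGIN/END CONFIG block above is what tools/launch parses (extract_config)
+# to size the submission: NUM_RUNS jobs of ~STEPS_PER_RUN steps each are
+# submitted, each with a NUM_MINUTES walltime, until MAX_STEPS is covered.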
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..14df2cfe5f --- /dev/null +++ b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
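+# All run artifacts (logs/, ckpts/, metrics.json, run.log) are derived from
+# this script's name below, so results land in a sibling directory named after
+# the recipe.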
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B.yaml \ + policy.model_name=meta-llama/Llama-3.2-1B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh new file mode 100755 index 0000000000..04a380c746 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
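+# The early-stopping check below asks jq for the highest step already logged in
+# metrics.json; on a fresh run the file does not exist yet, and the `|| echo 0`
+# fallback keeps the comparison well-defined.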
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + policy.max_total_sequence_length=16384 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..466b1a41ec --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +MAX_STEPS=2 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
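+# Training below runs FSDP2 (dtensor) with TP=8 plus sequence parallelism and
+# activation checkpointing, while vLLM generation uses its own TP=4 setting.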
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + policy.max_total_sequence_length=16384 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh new file mode 100755 index 0000000000..cb8f5c9bca --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
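+# dtensor is disabled below, so this recipe exercises the FSDP1 path; it pairs
+# with the fsdp2tp4sp variant for an FSDP1-vs-dtensor comparison.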
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-7B-Instruct \ + policy.dtensor_cfg.enabled=false \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh new file mode 100755 index 0000000000..a4037b01d7 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
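+# The trailing "$@" on the python invocation below forwards any extra CLI
+# arguments as config overrides, so one-off tweaks don't require editing the
+# recipe.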
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-7B-Instruct \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=4 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..89bf673983 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +MAX_STEPS=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
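+# check_metrics.py below is gated on reaching the target step count, so a
+# partially finished chain of jobs is not failed for incomplete metrics.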
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B.yaml \ + policy.model_name=Qwen/Qwen2.5-Math-1.5B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/performance/.gitkeep b/recipes/llm/performance/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh new file mode 100755 index 0000000000..70f834e0a8 --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
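+# Stdout/stderr are tee'd to run.log below, and the tensorboard event files are
+# dumped to metrics.json afterwards so the metric checks can run on plain JSON.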
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=False \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..1402a094dc --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eou pipefail + +# TODO: @ashors real convergence run +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=12000 +MAX_STEPS=12000 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
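+# STEPS_PER_RUN equals MAX_STEPS above, so NUM_RUNS resolves to 1 and the whole
+# convergence run fits in a single slurm job.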
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh new file mode 100755 index 0000000000..3f5ce413eb --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=350 +MAX_STEPS=350 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=2 \ + policy.dtensor_cfg.sequence_parallel=True \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..2c6f930399 --- /dev/null +++ b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=1000 +MAX_STEPS=1000 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.2-1B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi + diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..ad23383dc9 --- /dev/null +++ b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=20 # step_time ~ 29sec +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
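+# This 32B SFT recipe pushes the sequence length to 16000 below, hence TP=8,
+# sequence parallelism, and activation checkpointing are all enabled to keep
+# activation memory in check.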
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.max_total_sequence_length=16000 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000000..4e51a6efad --- /dev/null +++ b/tests/README.md @@ -0,0 +1,20 @@ +# Tests + +## Launching Release Tests + +```sh +# Assuming in NeMo RL project root + +cd tools/ + +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=1 to get a rough estimate of compute +DRYRUN=1 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=2 will create a codesnapshot with a fully hermetic example +DRYRUN=2 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# Run all (Caution: this will use a lot of compute; consider listing out the jobs) +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... 
./launch ../recipes/**/*.sh
+```

diff --git a/tests/functional/check_metrics.py b/tests/check_metrics.py
similarity index 100%
rename from tests/functional/check_metrics.py
rename to tests/check_metrics.py

diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index 2421c5da6a..e719f84b79 100755
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -7,15 +7,16 @@ git config --global --add safe.directory $PROJECT_ROOT
 
 set -eou pipefail
 
-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
-export RAY_DEDUP_LOGS=0
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_dpo.py \
@@ -31,9 +32,8 @@ python -u $PROJECT_ROOT/examples/run_dpo.py \
     $@ \
     2>&1 | tee $RUN_LOG
 
-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
-python check_metrics.py $JSON_METRICS \
+python -u tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["2"] < 0.694' \

diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh
index b61442227b..93b4ec25e1 100755
--- a/tests/functional/grpo.sh
+++ b/tests/functional/grpo.sh
@@ -2,19 +2,21 @@
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo
+# Mark the current repo as safe, since wandb fetches metadata about the repo
 git config --global --add safe.directory $PROJECT_ROOT
 
 set -eou pipefail
 
-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_grpo_math.py \
@@ -27,9 +29,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_math.py \
     $@ \
     2>&1 | tee $RUN_LOG
 
-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
-python check_metrics.py $JSON_METRICS \
+python -u tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \

diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index f3474fb0fd..812733338c 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -1,23 +1,25 @@
 #!/bin/bash
 
-## clean up checkpoint directory on exit
+# clean up checkpoint directory on exit
 trap "rm -rf /tmp/sft_checkpoints" EXIT
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
-# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetches metadata about the repo git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT python -u $PROJECT_ROOT/examples/run_sft.py \ @@ -34,10 +36,9 @@ python -u $PROJECT_ROOT/examples/run_sft.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: loss is very noisy, this check is mainly for sanity of immediate divergence -python check_metrics.py $JSON_METRICS \ +python -u tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["9"] < 1500' \ diff --git a/tests/functional/json_dump_tb_logs.py b/tests/json_dump_tb_logs.py similarity index 100% rename from tests/functional/json_dump_tb_logs.py rename to tests/json_dump_tb_logs.py diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt new file mode 100644 index 0000000000..9b4eac9491 --- /dev/null +++ b/tests/test_suites/nightly.txt @@ -0,0 +1,28 @@ +######## +# GRPO # +######## + +# Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much +recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh + +# FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) +recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh + +# Functional 32b run +recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh + +####### +# SFT # +####### + +# 1N 1B/8B runs +recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh + +# Dtensor vs fsdp1 (8B) +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh + +# Functional 32b test +recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh diff --git a/tests/test_suites/nightly_performance.txt b/tests/test_suites/nightly_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt new file mode 100644 index 0000000000..dfc997435b --- /dev/null +++ b/tests/test_suites/release.txt @@ -0,0 +1,16 @@ +######## +# GRPO # +######## + +# Long 8b run +recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh + +# Long 32b run +recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh + +####### +# SFT # +####### + +# Long 8b convergence +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh \ No newline at end of file diff --git a/tests/test_suites/release_performance.txt b/tests/test_suites/release_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py new file mode 100644 index 0000000000..100486ddf4 --- /dev/null +++ b/tests/unit/test_recipes_and_test_suites.py @@ -0,0 +1,200 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import os +import glob +import subprocess + +dir_path = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) +recipes_dir = os.path.join(project_root, "recipes") + +test_suites_dir = os.path.join(project_root, "tests", "test_suites") + +nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") +release_test_suite_path = os.path.join(test_suites_dir, "release.txt") +nightly_performance_test_suite_path = os.path.join( + test_suites_dir, "nightly_performance.txt" +) +release_performance_test_suite_path = os.path.join( + test_suites_dir, "release_performance.txt" +) + + +@pytest.fixture +def nightly_test_suite(): + nightly_suite = [] + with open(nightly_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + nightly_suite.append(line) + return nightly_suite + + +@pytest.fixture +def release_test_suite(): + release_suite = [] + with open(release_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + release_suite.append(line) + return release_suite + + +@pytest.fixture +def nightly_performance_test_suite(): + nightly_performance_suite = [] + with open(nightly_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + nightly_performance_suite.append(line) + return nightly_performance_suite + + +@pytest.fixture +def release_performance_test_suite(): + release_performance_suite = [] + with open(release_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + release_performance_suite.append(line) + return release_performance_suite + + +@pytest.fixture +def all_test_suites( + nightly_test_suite, + release_test_suite, + nightly_performance_test_suite, + release_performance_test_suite, +): + return ( + nightly_test_suite + + release_test_suite + + nightly_performance_test_suite + + release_performance_test_suite + ) + + +@pytest.mark.parametrize( + "test_suite_path", + [ + nightly_test_suite_path, + release_test_suite_path, + nightly_performance_test_suite_path, + release_performance_test_suite_path, + ], + ids=[ + "nightly_test_suite", + "release_test_suite", + "nightly_performance_test_suite", + "release_performance_test_suite", + ], +) +def test_test_suites_exist(test_suite_path): + assert os.path.exists(test_suite_path), ( + f"Test suite {test_suite_path} does not exist" + ) + + +def test_no_overlap_across_test_suites(all_test_suites): + recipes = set(all_test_suites) + assert len(recipes) == len(all_test_suites), f"Test suites have repeats {recipes}" + + +def test_all_recipes_accounted_for_in_test_suites(all_test_suites): + all_recipes_in_test_suites = set(all_test_suites) + + all_recipes_in_recipes_dir = set() + for recipe_path in glob.glob( + os.path.join(recipes_dir, "**", "*.sh"), recursive=True + ): + # Strip off the project root 
and leading slash + recipe_name = recipe_path[len(project_root) + 1 :] + all_recipes_in_recipes_dir.add(recipe_name) + + assert all_recipes_in_test_suites == all_recipes_in_recipes_dir, ( + "All recipes are not accounted for in the test suites" + ) + + +def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker): + command = f"DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}" + + print(f"Running command: {command}") + + # Run the command from the project root directory + result = subprocess.run( + command, + shell=True, + cwd=project_root, + capture_output=True, + text=True, + check=False, # Don't raise exception on non-zero exit code + ) + + # Print stdout and stderr for debugging if the test fails + print("STDOUT:") + print(result.stdout) + print("STDERR:") + print(result.stderr) + + # Assert that the command exited successfully + assert result.returncode == 0, f"Command failed with exit code {result.returncode}" + + # Assert that the last line of stdout contains the expected prefix + stdout_lines = result.stdout.strip().splitlines() + assert len(stdout_lines) > 0, "Command produced no output" + last_line = stdout_lines[-1] + assert last_line.startswith("[INFO]: Total GPU hours:"), ( + f"Last line of output was not as expected: '{last_line}'" + ) + total_gpu_hours = float(last_line.split(":")[-1].strip()) + assert total_gpu_hours <= 1024, f"Total GPU hours exceeded 1024: {last_line}" + tracker.track("total_nightly_gpu_hours", total_gpu_hours) + + +def test_dry_run_does_not_fail_and_prints_total_gpu_hours(): + command = "DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh" + + # Run the command from the project root directory + result = subprocess.run( + command, + shell=True, + cwd=project_root, + capture_output=True, + text=True, + check=False, # Don't raise exception on non-zero exit code + ) + + # Print stdout and stderr for debugging if the test fails + print("STDOUT:") + print(result.stdout) + print("STDERR:") + print(result.stderr) + + # Assert that the command exited successfully + assert result.returncode == 0, f"Command failed with exit code {result.returncode}" + + # Assert that the last line of stdout contains the expected prefix + stdout_lines = result.stdout.strip().splitlines() + assert len(stdout_lines) > 0, "Command produced no output" + last_line = stdout_lines[-1] + assert last_line.startswith("[INFO]: Total GPU hours:"), ( + f"Last line of output was not as expected: '{last_line}'" + ) diff --git a/tools/autoformat.sh b/tools/autoformat.sh old mode 100644 new mode 100755 diff --git a/tools/code_snapshot.sh b/tools/code_snapshot.sh new file mode 100644 index 0000000000..62136a8632 --- /dev/null +++ b/tools/code_snapshot.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +PROJECT_ROOT=${SCRIPT_DIR}/.. +cd ${PROJECT_ROOT} + +echo2() { + echo "$@" >&2 +} + +if [[ ! -e "$PROJECT_ROOT/.git" ]]; then + echo2 "[Error]: This script was not run from the root of NeMo RL git repo. Please clone it first." + exit 1 +elif [[ $# -lt 1 ]]; then + echo2 "[Error]: This script requires one argument: the name of the experiment to be used as the snapshot directory name" + echo2 "Usage: bash tools/code_snapshot.sh " + exit 1 +fi + +EXP_NAME=$1 + +SNAPSHOT_DIR="$PROJECT_ROOT/code_snapshots/${EXP_NAME}" +if [[ ! 
-d "$SNAPSHOT_DIR" ]]; then + echo2 "Creating new code snapshot in $SNAPSHOT_DIR" + mkdir -p $SNAPSHOT_DIR +else + echo2 "Using existing code snapshot in $SNAPSHOT_DIR" + # Echo the snapshot directory so the caller can use it to `cd` into it + echo ${SNAPSHOT_DIR} + exit +fi + +echo2 "Copying git-tracked files..." +rsync -a --files-from=<(git ls-files) ./ $SNAPSHOT_DIR/ + + +# Echo the snapshot directory so the caller can use it to `cd` into it +echo ${SNAPSHOT_DIR} \ No newline at end of file diff --git a/tools/launch b/tools/launch new file mode 100755 index 0000000000..1db03b3b03 --- /dev/null +++ b/tools/launch @@ -0,0 +1,175 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +# This is a helper script to launch a release test on slurm. +# It reads a demarcated section of the script to extract the config, +# and uses that to determine how many nodes and how many chained jobs to launch. +# +# It also creates a code snapshot to ensure that the code is reproducible and subsequent +# jobs can be launched with the same code. It also creates a continue.sh in the code +# snapshot directory to continue launching the job even if the original invocation was +# forgotten. +# +# Usage: +# CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... +# + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/..) + +# Function to extract config from a script +extract_config() { + local script_path="$1" + local config=$(sed -n '/^# =\+ BEGIN CONFIG =\+/,/^# =\+ END CONFIG =\+/p' "$script_path" | + grep -v "^#" | + grep "=" ) + if [[ -z "$config" ]]; then + echo "[ERROR]: No config section found in script_path=$script_path" + echo "[ERROR]: Please add and update a section in the script with these variables:" + echo + echo "# ===== BEGIN CONFIG =====" + echo "NUM_NODES=1" # How many nodes this job uses + echo "STEPS_PER_RUN=60" # Approximately how many steps reached in one job + echo "MAX_STEPS=60" # Max training steps + echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up' + echo "NUM_MINUTES=240" # How many minutes one job is (SLURM specific) + echo "# ===== END CONFIG =====" + return 1 + fi 1>&2 + echo "$config" +} + +check_file_in_version_control_and_get_relpath_from_git_root() { + local script_path="$1" + # Check if the script is tracked in git (assumes we're in the repo already) + rel_path_from_git_root=$(git ls-files --full-name --error-unmatch "$script_path") + ret_code=$? + if [[ $ret_code -ne 0 ]]; then + echo "[ERROR]: Script '$script_path' is not tracked in version control." >&2 + echo "[ERROR]: This may cause reproducibility issues. Add it to git to continue." >&2 + return 1 + fi + echo "$rel_path_from_git_root" +} + +set -eou pipefail + +if [[ $# -eq 0 ]]; then + echo "Error: No script provided." + echo "Usage: CONTAINER=... ACCOUNT=... PARTITION=... 
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+
+# Function to extract config from a script
+extract_config() {
+    local script_path="$1"
+    local config=$(sed -n '/^# =\+ BEGIN CONFIG =\+/,/^# =\+ END CONFIG =\+/p' "$script_path" |
+        grep -v "^#" |
+        grep "=" )
+    if [[ -z "$config" ]]; then
+        echo "[ERROR]: No config section found in script_path=$script_path"
+        echo "[ERROR]: Please add and update a section in the script with these variables:"
+        echo
+        echo "# ===== BEGIN CONFIG ====="
+        echo "NUM_NODES=1" # How many nodes this job uses
+        echo "STEPS_PER_RUN=60" # Approximately how many steps reached in one job
+        echo "MAX_STEPS=60" # Max training steps
+        echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up'
+        echo "NUM_MINUTES=240" # How many minutes one job is (SLURM specific)
+        echo "# ===== END CONFIG ====="
+        return 1
+    fi 1>&2
+    echo "$config"
+}
+
+check_file_in_version_control_and_get_relpath_from_git_root() {
+    local script_path="$1"
+    # Check if the script is tracked in git (assumes we're in the repo already)
+    rel_path_from_git_root=$(git ls-files --full-name --error-unmatch "$script_path")
+    ret_code=$?
+    if [[ $ret_code -ne 0 ]]; then
+        echo "[ERROR]: Script '$script_path' is not tracked in version control." >&2
+        echo "[ERROR]: This may cause reproducibility issues. Add it to git to continue." >&2
+        return 1
+    fi
+    echo "$rel_path_from_git_root"
+}
+
+set -eou pipefail
+
+if [[ $# -eq 0 ]]; then
+    echo "Error: No script provided."
+    echo "Usage: CONTAINER=... ACCOUNT=... PARTITION=... $0 ..."
+    exit 1
+fi
+
+# Check for mandatory environment variables
+for VAR in "HF_HOME" "HF_DATASETS_CACHE"; do
+    if [[ -z "${!VAR:-}" ]]; then
+        echo "[ERROR]: $VAR environment variable is not set."
+        echo "[ERROR]: Please set $VAR to specify the appropriate Hugging Face directory."
+        echo "Example: export $VAR=/path/to/appropriate/directory"
+        exit 1
+    fi
+done
+
+CONTAINER=$CONTAINER
+ACCOUNT=$ACCOUNT
+PARTITION=$PARTITION
+MOUNTS=${MOUNTS:-}
+# DRYRUN=1 prints the runs and how much compute they use
+# DRYRUN=2 additionally creates the snapshots (helpful to run a hermetic example manually or share a repro)
+DRYRUN=${DRYRUN:-}
+IS_RELEASE=${IS_RELEASE:-} # Adds extra configuration for wandb to track this in the right project
+NOW=$(date '+%y%m%d-%H%M%S')
+
+if [[ -n "$MOUNTS" ]]; then
+    # Comma needed since we always mount PWD
+    MOUNTS=",$MOUNTS"
+fi
+
+SCRIPTS=""
+for SCRIPT in $@; do
+    if [[ ! -f "$SCRIPT" ]]; then
+        echo "Error: Script '$SCRIPT' does not exist or is not a file."
+        echo "Please provide a valid script path."
+        exit 1
+    fi
+    SCRIPTS+=" $SCRIPT"
+done
+
+total_gpu_hours=0
+
+for SCRIPT in $SCRIPTS; do
+    # Extract and evaluate the config
+    if ! config=$(extract_config $SCRIPT); then
+        # Error message is already printed by extract_config
+        exit 1
+    fi
+    eval "$config"
+
+    # NUM_RUNS * NUM_NODES * NUM_GPUS * (NUM_MINUTES / 60)
+    gpu_hours=$((NUM_RUNS * NUM_NODES * 8 * NUM_MINUTES / 60))
+    total_gpu_hours=$((total_gpu_hours + gpu_hours))
+    echo "[INFO]: $gpu_hours GPUhrs to run $SCRIPT"
+    if [[ "${DRYRUN}" -eq 1 ]]; then
+        echo "[DRY_RUN]: Skipping creation of snapshot and submission of $SCRIPT."
+        continue
+    fi
+
+    rel_script=$(check_file_in_version_control_and_get_relpath_from_git_root $SCRIPT)
+
+    EXP_NAME=$(basename $SCRIPT .sh)
+    SNAPSHOT_DIR=$(bash $PROJECT_ROOT/tools/code_snapshot.sh $EXP_NAME)
+
+    # Now use the variables
+    for i in $(seq 1 $NUM_RUNS); do
+        echo "Submitting $i/$NUM_RUNS job with ${NUM_NODES} nodes for $(basename $SCRIPT)"
+        JOB_NAME=$(basename $SCRIPT .sh)
+
+        RELEASE_ARGS=()
+        if [[ -n "${IS_RELEASE}" ]]; then
+            RELEASE_ARGS=(
+                logger.wandb.project=nemo-rl-release
+                logger.wandb.name=$(basename $SCRIPT .sh)-$(git rev-parse --short HEAD)
+            )
+        fi
+
+        # TODO: jq install is just to be backward compatible with older containers. Should eventually remove.
+        cat <<EOF >$SNAPSHOT_DIR/continue.sh
+#!/bin/bash
+SCRIPT_DIR=\$( cd -- "\$( dirname -- "\${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd \$SCRIPT_DIR
+
+HF_HOME=$HF_HOME \\
+HF_DATASETS_CACHE=$HF_DATASETS_CACHE \\
+COMMAND="apt install -y jq && uv run $rel_script ${RELEASE_ARGS[@]}" \\
+CONTAINER=$CONTAINER \\
+MOUNTS="$SNAPSHOT_DIR:$SNAPSHOT_DIR${MOUNTS}" \\
+sbatch \\
+    --nodes=$NUM_NODES \\
+    --account=$ACCOUNT \\
+    --job-name=$ACCOUNT:$JOB_NAME \\
+    --partition=$PARTITION \\
+    --time=0:${NUM_MINUTES}:0 \\
+    --gres=gpu:8 \\
+    --output=slurm-${NOW}-%j-${JOB_NAME}-${i}.${NUM_RUNS}.out \\
+    ray.sub
+EOF
+        if [[ "${DRYRUN}" -eq 2 ]]; then
+            echo "[DRY_RUN]: Skipping submission of $SCRIPT. Find the snapshot at $SNAPSHOT_DIR and manually launch with 'bash continue.sh'"
+        else
+            bash $SNAPSHOT_DIR/continue.sh
+        fi
+    done
+done
+echo "[INFO]: Total GPU hours: $total_gpu_hours"
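+
+# Worked example (illustrative numbers): a recipe with NUM_RUNS=5, NUM_NODES=4,
+# and NUM_MINUTES=240 is accounted above as 5 * 4 * 8 * 240 / 60 = 640 GPU hours.
+# A dry run over a set of recipes prints one such line per recipe plus the final
+# total, without creating snapshots or submitting anything:
+#
+#   HF_HOME=... HF_DATASETS_CACHE=... DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= \
+#       ./tools/launch recipes/llm/*.sh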
diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
new file mode 100755
index 0000000000..bf8aa1befc
--- /dev/null
+++ b/tools/package_release_runs.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# This script packages all release runs into a tarball with a git SHA so that we can upload to our
+# release page. The SHA is to avoid conflicts with previous runs, but when we upload we should
+# remove it so that users can expect the name to be release_runs.tar.gz (this renaming can be
+# done in the GitHub Release UI).
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+cd $PROJECT_ROOT
+
+set -eou pipefail
+
+# Create a temporary directory
+TMP_DIR=$(mktemp -d)
+echo "Created temporary directory: $TMP_DIR"
+
+# Loop over all the recipe runs and package them into a tarball
+for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
+    exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
+    # Obfuscate the hostname, e.g.
+    #   events.out.tfevents.1744822578.<hostname>.780899.0
+    obfuscated_event_path=$(basename $tbevent | awk -F. '{print $1"."$2"."$3"."$4".HOSTNAME."$(NF-1)"."$NF}')
+
+    # Create subdirectory for experiment if it doesn't exist
+    mkdir -p "$TMP_DIR/$exp_name"
+
+    # Copy the event file with obfuscated name to the experiment subdirectory
+    cp "$tbevent" "$TMP_DIR/$exp_name/$obfuscated_event_path"
+
+    echo "[$exp_name] Copied $tbevent to $TMP_DIR/$exp_name/$obfuscated_event_path"
+done
+
+# Create a tarball of all the processed event files
+OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
+tar -czf "$OUTPUT_TAR" -C "$TMP_DIR" .
+echo "Created tarball: $OUTPUT_TAR"
+
+# Clean up the temporary directory
+rm -rf "$TMP_DIR"
+echo "Cleaned up temporary directory $TMP_DIR"

From f3aea5b982174d68446b462e04a7152ed455b53e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 24 Apr 2025 12:21:46 -0700
Subject: [PATCH 02/19] fix unit tests

Signed-off-by: Terry Kong

---
 tests/unit/test_recipes_and_test_suites.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 100486ddf4..244dde7b4e 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -134,7 +134,7 @@ def test_all_recipes_accounted_for_in_test_suites(all_test_suites):
 
 
 def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):
-    command = f"DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
+    command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
 
     print(f"Running command: {command}")
 
@@ -170,7 +170,7 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):
 
 
 def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
-    command = "DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh"
+    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=...
CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh" # Run the command from the project root directory result = subprocess.run( From da05a056056996926413625e79eade66d15e6eb7 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Apr 2025 12:22:45 -0700 Subject: [PATCH 03/19] helpful msg Signed-off-by: Terry Kong --- tests/unit/test_recipes_and_test_suites.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 244dde7b4e..0214d1ca5e 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -165,7 +165,9 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker): f"Last line of output was not as expected: '{last_line}'" ) total_gpu_hours = float(last_line.split(":")[-1].strip()) - assert total_gpu_hours <= 1024, f"Total GPU hours exceeded 1024: {last_line}" + assert total_gpu_hours <= 1024, ( + f"Total GPU hours exceeded 1024: {last_line}. We should revisit the test suites to reduce the total GPU hours." + ) tracker.track("total_nightly_gpu_hours", total_gpu_hours) From 42aaaf25de1b9ee3761c74a3aae3575e289eccf4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Apr 2025 22:01:02 -0700 Subject: [PATCH 04/19] fix settings Signed-off-by: Terry Kong --- .../grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 2 +- .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 2 +- .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 2 +- .../grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 2 +- .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 4 ++-- ...sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 15 ++++++++------- .../sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 2 +- recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 8 ++++---- .../sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 10 ++++++---- 9 files changed, 25 insertions(+), 22 deletions(-) diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh index 14df2cfe5f..3989e10c51 100755 --- a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +++ b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -60,6 +60,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["500"] < 1.1' fi diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh index 466b1a41ec..0f7cf2ef76 100755 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -64,6 +64,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["2"] < 1.1' fi diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index cb8f5c9bca..6509829b0f 100755 --- 
a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -6,7 +6,7 @@ NUM_NODES=4 STEPS_PER_RUN=30 MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=60 +NUM_MINUTES=90 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh index a4037b01d7..6686df40a5 100755 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -62,6 +62,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["30"] < 1.1' fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index 70f834e0a8..f37dd89a1b 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -6,12 +6,12 @@ NUM_NODES=1 STEPS_PER_RUN=500 MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 +NUM_MINUTES=30 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) -# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetchs metadata about the repo/ git config --global --add safe.directory $PROJECT_ROOT EXP_NAME=$(basename $0 .sh) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh index 1402a094dc..f2e2e9f2e0 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -1,13 +1,13 @@ #!/bin/bash set -eou pipefail -# TODO: @ashors real convergence run +# TODO: @ashors real convergence run (dataset only has 2737) # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=12000 -MAX_STEPS=12000 +STEPS_PER_RUN=2730 +MAX_STEPS=2730 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 +NUM_MINUTES=120 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) @@ -58,11 +58,12 @@ python -u examples/run_sft.py \ # Convert tensorboard logs to json python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA/reinforcer/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["2730"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 3f5ce413eb..6421fed43f 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -6,7 +6,7 @@ NUM_NODES=1 STEPS_PER_RUN=350 MAX_STEPS=350 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 +NUM_MINUTES=45 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh index 2c6f930399..28028c1cdd 100755 --- a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -3,8 +3,8 @@ set -eou pipefail # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=1000 -MAX_STEPS=1000 +STEPS_PER_RUN=500 +MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -60,7 +60,7 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["500"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 25000' fi diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh index ad23383dc9..9a2ca8f19f 100755 --- a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -1,6 +1,9 @@ #!/bin/bash set -eou pipefail +# TODO: this config can crash on OOM +# https://github.com/NVIDIA/reinforcer/issues/263 + # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=20 # step_time ~ 29sec @@ -62,9 +65,8 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 1.5' \ + 'data["train/loss"]["20"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 35000' fi From b3a047195c2d979b1b163b05559326f275fd691c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:26:54 -0700 Subject: [PATCH 05/19] fix step grpo-qwen2.5-7b-instruct-4n8g-fsdp1 Signed-off-by: Terry Kong --- recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index 6509829b0f..22e69c307e 100755 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -59,6 +59,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["30"] < 1.1' fi From f620160787b2a07dcc8bc01b548a6a677225f943 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:32:59 -0700 Subject: [PATCH 06/19] finalized sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp Signed-off-by: Terry Kong --- recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 6421fed43f..bcff7b5a38 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -58,11 +58,13 @@ python -u examples/run_sft.py \ # Convert tensorboard logs to json python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 + # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["60"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi From d5962e6770425b6c110f83fbe8a2b7b678ef0aa4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:35:30 -0700 Subject: [PATCH 07/19] finalized grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long Signed-off-by: Terry Kong --- recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh index 04a380c746..0494e8a6d9 100755 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -64,6 +64,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["20"] < 1.1' fi From b8aa7f0df712c31e03008e098b2fcd76da951a9e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 09:02:03 -0700 Subject: [PATCH 08/19] fix sft-llama3.1-8b-instruct-1n8g-fsdp1 Signed-off-by: Terry Kong --- recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index f37dd89a1b..a1c68aa28a 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -3,8 +3,8 @@ set -eou pipefail # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -60,7 +60,7 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 4' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 60000' fi From f656e495e2b46d0d11b86b975776ae1727396d5c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 15:58:31 -0700 Subject: [PATCH 09/19] uber refactor Signed-off-by: Terry Kong --- ...ama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml | 109 ++++++++++++++++++ ...po-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml | 109 ++++++++++++++++++ ...2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml | 109 ++++++++++++++++++ ...-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml | 109 ++++++++++++++++++ .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml | 109 ++++++++++++++++++ ...o-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml | 109 ++++++++++++++++++ ...n2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml | 109 ++++++++++++++++++ .../sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml | 67 +++++++++++ ...ama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml | 67 +++++++++++ ...-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml | 67 +++++++++++ .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml | 67 +++++++++++ ...t-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml | 67 +++++++++++ ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh | 65 ----------- ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 65 ----------- ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 69 ----------- ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 69 ----------- .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 64 ---------- ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 67 ----------- ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh | 65 ----------- .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 66 ----------- ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 69 ----------- ...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 70 ----------- recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 66 ----------- ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 72 ------------ {recipes => tests/test_suites}/README.md | 0 tests/test_suites/llm/common.env | 51 ++++++++ ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh | 38 ++++++ ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 38 ++++++ ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 38 ++++++ ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 38 ++++++ .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 38 ++++++ ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 38 ++++++ ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh | 38 ++++++ .../test_suites}/llm/performance/.gitkeep | 0 .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 39 +++++++ ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 41 +++++++ 
...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 41 +++++++ .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 39 +++++++ ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 41 +++++++ 39 files changed, 1616 insertions(+), 807 deletions(-) create mode 100644 examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml create mode 100644 examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml delete mode 100755 recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh delete mode 100755 recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh delete mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh delete mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh delete mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh delete mode 100755 recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh delete mode 100755 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh rename {recipes => tests/test_suites}/README.md (100%) create mode 100644 tests/test_suites/llm/common.env create mode 100755 tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh create mode 100755 tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh rename {recipes => tests/test_suites}/llm/performance/.gitkeep (100%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh create mode 100755 tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh create mode 100755 tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml 
b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..ba6ba255f3 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.1-8B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..96e8e023cb --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: 
results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B-Instruct + tokenizer: + name: meta-llama/Llama-3.2-1B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.2-1B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml new file mode 100644 index 0000000000..3693ac4677 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 20 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + 
weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..aed12183a8 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 2 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + 
dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml new file mode 100644 index 0000000000..27211ddc7e --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml new file mode 100644 index 0000000000..87e2c592c0 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + 
num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 4 + make_sequence_length_divisible_by: 4 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..9f5762f173 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 450 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct + tokenizer: + name: Qwen/Qwen2.5-Math-1.5B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + 
generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml new file mode 100644 index 0000000000..da0140a73e --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 250 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1 + wandb_enabled: true + tensorboard_enabled: true + 
monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..288f365c1a --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 2730 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml new file mode 100644 index 0000000000..f065b5cd34 --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 350 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + 
train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 2 + make_sequence_length_divisible_by: 2 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..7c4bd357ed --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 500 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B + tokenizer: + name: meta-llama/Llama-3.2-1B + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: float32 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.2-1b-1n8g-fsdp2tp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..4cd1a5387c --- /dev/null +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 20 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + 
enabled: true + checkpoint_dir: results/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 16000 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 16000 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh deleted file mode 100755 index 3feb431a2f..0000000000 --- a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=100 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' -fi - diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh deleted file mode 100755 index 3989e10c51..0000000000 --- a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_1B.yaml \ - policy.model_name=meta-llama/Llama-3.2-1B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["500"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh deleted file mode 100755 index 0494e8a6d9..0000000000 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=16 -STEPS_PER_RUN=10 -MAX_STEPS=20 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-32B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=8 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.dtensor_cfg.activation_checkpointing=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - policy.max_total_sequence_length=16384 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["20"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh deleted file mode 100755 index 0f7cf2ef76..0000000000 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=16 -STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] -MAX_STEPS=2 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=60 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-32B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=8 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.dtensor_cfg.activation_checkpointing=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - policy.max_total_sequence_length=16384 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["2"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh deleted file mode 100755 index 22e69c307e..0000000000 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=30 -MAX_STEPS=30 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=90 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-7B-Instruct \ - policy.dtensor_cfg.enabled=false \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["30"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh deleted file mode 100755 index 6686df40a5..0000000000 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=30 -MAX_STEPS=30 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=180 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-7B-Instruct \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=4 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["30"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh deleted file mode 100755 index 89bf673983..0000000000 --- a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=450 -MAX_STEPS=450 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_1B.yaml \ - policy.model_name=Qwen/Qwen2.5-Math-1.5B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' -fi - diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh deleted file mode 100755 index a1c68aa28a..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=250 -MAX_STEPS=250 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo/ -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=False \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 4' \ - 'data["train/loss"]["250"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 60000' -fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh deleted file mode 100755 index f2e2e9f2e0..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# TODO: @ashors real convergence run (dataset only has 2737) -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=2730 -MAX_STEPS=2730 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263 -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["2730"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 45000' -fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh deleted file mode 100755 index bcff7b5a38..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=350 -MAX_STEPS=350 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=45 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=2 \ - policy.dtensor_cfg.sequence_parallel=True \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["60"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 45000' -fi diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh deleted file mode 100755 index 28028c1cdd..0000000000 --- a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.2-1B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["500"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 25000' -fi - diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh deleted file mode 100755 index 9a2ca8f19f..0000000000 --- a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# TODO: this config can crash on OOM -# https://github.com/NVIDIA/reinforcer/issues/263 - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=20 # step_time ~ 29sec -MAX_STEPS=20 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo
-git config --global --add safe.directory $PROJECT_ROOT
-
-EXP_NAME=$(basename $0 .sh)
-EXP_DIR=$SCRIPT_DIR/$EXP_NAME
-LOG_DIR=$EXP_DIR/logs
-CKPT_DIR=$EXP_DIR/ckpts
-JSON_METRICS=$EXP_DIR/metrics.json
-RUN_LOG=$EXP_DIR/run.log
-export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
-
-mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR
-
-# Early stopping to save compute if max step has been reached
-STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
-if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then
-    echo "[INFO] Target step $MAX_STEPS reached, skipping run"
-    exit 0
-fi
-echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps"
-
-# Run the experiment
-cd $PROJECT_ROOT
-python -u examples/run_sft.py \
-    --config examples/configs/sft.yaml \
-    policy.model_name=Qwen/Qwen2.5-32B \
-    policy.precision=bfloat16 \
-    policy.dtensor_cfg.enabled=True \
-    policy.dtensor_cfg.tensor_parallel_size=8 \
-    policy.dtensor_cfg.sequence_parallel=True \
-    policy.dtensor_cfg.activation_checkpointing=True \
-    policy.max_total_sequence_length=16000 \
-    cluster.num_nodes=$NUM_NODES \
-    cluster.gpus_per_node=8 \
-    sft.max_num_steps=$MAX_STEPS \
-    logger.log_dir=$LOG_DIR \
-    logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl \
-    logger.wandb.name=$EXP_NAME \
-    logger.monitor_gpus=True \
-    logger.tensorboard_enabled=True \
-    checkpointing.enabled=True \
-    checkpointing.checkpoint_dir=$CKPT_DIR \
-    $@ \
-    2>&1 | tee $RUN_LOG
-
-# Convert tensorboard logs to json
-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
-
-# Only run metrics if the target step is reached
-if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
-    python -u tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["1"] < 1.5' \
-        'data["train/loss"]["20"] < 0.3' \
-        'max(data["ray/node.0.gpu.0.memory"]) < 35000'
-fi
diff --git a/recipes/README.md b/tests/test_suites/README.md
similarity index 100%
rename from recipes/README.md
rename to tests/test_suites/README.md
diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
new file mode 100644
index 0000000000..2fc1bb27be
--- /dev/null
+++ b/tests/test_suites/llm/common.env
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Source this file before running a test to set up the environment:
+#
+# source ./common.env
+set -eou pipefail
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(git rev-parse --show-toplevel)
+# Mark the current repo as safe, since wandb fetches metadata about the repo
+git config --global --add safe.directory $PROJECT_ROOT
+
+get_max_steps_from_yaml() {
+    local top_key="$1"
+    local yaml_path="$2"
+    # Use python -c to import yaml, load the file, and print the specific value
+    # Access the dictionary using the provided top_key
+    python -c "import yaml; f=open('$yaml_path', 'r'); data=yaml.safe_load(f); print(data['$top_key']['max_num_steps']); f.close()"
+}
+
+exit_if_max_steps_reached() {
+    # Early stopping to save compute if max step has been reached
+    STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
+    if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then
+        echo "[INFO] Target step $MAX_STEPS reached, skipping run"
+        exit 0
+    fi
+    echo "[INFO] Steps so far: $STEPS_SO_FAR, running until step $MAX_STEPS"
+}
+
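+# Illustrative (hypothetical) metrics.json shape consumed by the jq query above:
+#   {"train/loss": {"1": 2.31, "2": 2.05, "500": 0.42}, "train/token_mult_prob_error": {...}}
+# `keys | map(tonumber) | max` picks out the last logged step, and the
+# `|| echo 0` fallback covers the first run, before metrics.json exists.
+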
+EXP_NAME=$(basename $0 .sh) +ALGO=$(cut -d'-' -f1 <<< $EXP_NAME) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log + +# Test script has path: tests/test_suites/llm/${EXP_NAME}.sh +# where config has path: examples/configs/recipes/llm/${EXP_NAME}.yaml +# We will assume/check the path matches this pattern +CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites/llm#examples/configs/recipes/llm#') +if [[ ! -f $CONFIG_PATH ]]; then + echo "[ERROR] Config file $CONFIG_PATH not found" + exit 1 +fi + +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH) \ No newline at end of file diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..b0f81e9886 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=100 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..3cf1a34eb5 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | 
keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh new file mode 100755 index 0000000000..59b1d4f7d3 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["20"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..bf1dba7e85 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["2"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh new file mode 100755 index 0000000000..7e9ef5b050 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -0,0 +1,38 @@ 
+#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=90 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh new file mode 100755 index 0000000000..ba3fe1dd52 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..6f78bce178 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + 
logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/performance/.gitkeep b/tests/test_suites/llm/performance/.gitkeep similarity index 100% rename from recipes/llm/performance/.gitkeep rename to tests/test_suites/llm/performance/.gitkeep diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh new file mode 100755 index 0000000000..048de3418c --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 4' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 60000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..f963a6c55d --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# TODO: @ashors real convergence run (dataset only has 2737) +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=2730 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path 
$JSON_METRICS + +# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["2730"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh new file mode 100755 index 0000000000..2bec6f9c8e --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=350 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=45 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["60"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..2373889ecc --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["500"] < 
0.5' \
+        'max(data["ray/node.0.gpu.0.memory"]) < 25000'
+fi
+
diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
new file mode 100755
index 0000000000..f7a9050c9c
--- /dev/null
+++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# TODO: this config can crash on OOM
+# https://github.com/NVIDIA/reinforcer/issues/263
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=20 # step_time ~ 29sec
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=30
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_sft.py \
+    --config $CONFIG_PATH \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'data["train/loss"]["1"] < 1.5' \
+        'data["train/loss"]["20"] < 0.3' \
+        'max(data["ray/node.0.gpu.0.memory"]) < 35000'
+fi

From 1db320b9457d7e9bff0508e450b33509190bcc0d Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 16:02:44 -0700
Subject: [PATCH 10/19] add dryrun backdoor for tests

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 2fc1bb27be..af00aae410 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -46,6 +46,11 @@ fi
 
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR
+MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
+
+if [[ -n "${DRYRUN:-}" ]]; then
+    echo "[INFO] DRYRUN mode: exiting before launch (used for testing)"
+    exit
+fi
 
-MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
\ No newline at end of file
+mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR

From 990ca45ab1b653a2a0fa36f00db7719d0616245b Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 16:29:21 -0700
Subject: [PATCH 11/19] revert maxsteps

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env                    | 13 +------------
 .../grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh |  2 ++
 .../llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh  |  2 ++
 ...rpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh |  2 ++
 .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh    |  2 ++
 .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh      |  2 ++
 .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh |  2 ++
 ...grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh |  4 +++-
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh      |  2 ++
 .../sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh  |  2 ++
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh |  4 +++-
 .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh            |  2 ++
 .../llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh  |  2 ++
 13 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index af00aae410..870b5af6bc 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -9,14 +9,6 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 # Mark the current repo as safe, since wandb fetches metadata about the repo
 git config --global --add safe.directory $PROJECT_ROOT
 
-get_max_steps_from_yaml() {
-    local top_key="$1"
-    local yaml_path="$2"
-    # Use python -c to import yaml, load the file, and print the specific value
-    # Access the dictionary using the provided top_key
-    python -c "import yaml; f=open('$yaml_path', 'r'); data=yaml.safe_load(f); print(data['$top_key']['max_num_steps']); f.close()"
-}
-
 exit_if_max_steps_reached() {
     # Early stopping to save compute if max step has been reached
     STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
@@ -28,7 +20,6 @@ exit_if_max_steps_reached() {
 }
 
 EXP_NAME=$(basename $0 .sh)
-ALGO=$(cut -d'-' -f1 <<< $EXP_NAME)
 EXP_DIR=$SCRIPT_DIR/$EXP_NAME
 LOG_DIR=$EXP_DIR/logs
 CKPT_DIR=$EXP_DIR/ckpts
@@ -38,7 +29,7 @@ RUN_LOG=$EXP_DIR/run.log
 # Test script has path: tests/test_suites/llm/${EXP_NAME}.sh
 # where config has path: examples/configs/recipes/llm/${EXP_NAME}.yaml
 # We will assume/check the path matches this pattern
-CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites/llm#examples/configs/recipes/llm#')
+CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites#examples/configs/recipes#')
 if [[ ! -f $CONFIG_PATH ]]; then
     echo "[ERROR] Config file $CONFIG_PATH not found"
     exit 1
@@ -46,8 +37,6 @@ fi
 
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
-
 if [[ -n "${DRYRUN:-}" ]]; then
     echo "[INFO] DRYRUN mode: exiting before launch (used for testing)"
     exit
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
index b0f81e9886..6e64876058 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
@@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env
 # ===== BEGIN CONFIG =====
 NUM_NODES=4
 STEPS_PER_RUN=100
+MAX_STEPS=500
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
 NUM_MINUTES=240
 # ===== END CONFIG =====
@@ -15,6 +16,7 @@ exit_if_max_steps_reached
 cd $PROJECT_ROOT
 uv run examples/run_grpo_math.py \
     --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
     logger.wandb.project=nemo-rl \
diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
index 3cf1a34eb5..45cfad6e83 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
@@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env
 # ===== BEGIN CONFIG =====
 NUM_NODES=1
 STEPS_PER_RUN=500
+MAX_STEPS=500
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
 NUM_MINUTES=120
 # ===== END CONFIG =====
@@ -15,6 +16,7 @@ exit_if_max_steps_reached
 cd $PROJECT_ROOT
 uv run examples/run_grpo_math.py \
     --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
     logger.wandb.project=nemo-rl \
diff --git
a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh index 59b1d4f7d3..69c9899ccd 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=16 STEPS_PER_RUN=10 +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=240 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh index bf1dba7e85..ccdef1b2bd 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=16 STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +MAX_STEPS=2 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=60 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index 7e9ef5b050..49c96a6f58 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=30 +MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=90 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh index ba3fe1dd52..b3071fb58e 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=30 +MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=180 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh index 6f78bce178..98df00c25c 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +++ 
b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=450 +MAX_STEPS=450 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ @@ -33,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["450"] < 1.1' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index 048de3418c..1e51c2a78f 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh index f963a6c55d..1f937018a3 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -6,6 +6,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=2730 +MAX_STEPS=2730 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -16,6 +17,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 2bec6f9c8e..2379681138 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=350 +MAX_STEPS=350 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=45 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ @@ -36,6 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ 
'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["60"] < 0.5' \ + 'data["train/loss"]["350"] < 0.5' \ 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh index 2373889ecc..24b966c2af 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=500 +MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh index f7a9050c9c..fd40a85764 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -8,6 +8,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=20 # step_time ~ 29sec +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -18,6 +19,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ From 33dd63484e443ec3f9f8aedb82bf870eaf724d52 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 17:03:06 -0700 Subject: [PATCH 12/19] fix tests Signed-off-by: Terry Kong --- tests/test_suites/llm/common.env | 5 ++-- tests/test_suites/nightly.txt | 18 +++++++------- tests/test_suites/release.txt | 6 ++--- tests/unit/test_recipes_and_test_suites.py | 28 ++++++++++++++++------ tools/launch | 8 +++---- 5 files changed, 40 insertions(+), 25 deletions(-) diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env index 870b5af6bc..918b7bff1b 100644 --- a/tests/test_suites/llm/common.env +++ b/tests/test_suites/llm/common.env @@ -5,8 +5,9 @@ set -eou pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT +# Mark all repos as safe in the test context, since wandb fetchs metadata about the repo and it's a +# catch-22 to get the project root and mark it safe if you don't know the project root +git config --global --add safe.directory "*" PROJECT_ROOT=$(git rev-parse --show-toplevel) exit_if_max_steps_reached() { diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 9b4eac9491..4c609d5bff 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,26 +3,26 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh -recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh # FSDP1 vs Dtensor 
From 33dd63484e443ec3f9f8aedb82bf870eaf724d52 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 17:03:06 -0700
Subject: [PATCH 12/19] fix tests

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env           |  5 ++--
 tests/test_suites/nightly.txt              | 18 +++++++-------
 tests/test_suites/release.txt              |  6 ++---
 tests/unit/test_recipes_and_test_suites.py | 28 ++++++++++++++++------
 tools/launch                               |  8 +++----
 5 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 870b5af6bc..918b7bff1b 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -5,8 +5,9 @@ set -eou pipefail

 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

-# Mark the current repo as safe, since wandb fetchs metadata about the repo
-git config --global --add safe.directory $PROJECT_ROOT
+# Mark all repos as safe in the test context, since wandb fetches metadata about the repo and it's a
+# catch-22 to get the project root and mark it safe if you don't know the project root
+git config --global --add safe.directory "*"
 PROJECT_ROOT=$(git rev-parse --show-toplevel)

 exit_if_max_steps_reached() {
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
index 9b4eac9491..4c609d5bff 100644
--- a/tests/test_suites/nightly.txt
+++ b/tests/test_suites/nightly.txt
@@ -3,26 +3,26 @@
 ########

 # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much
-recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
-recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh

 # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct)
-recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
-recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh

 # Functional 32b run
-recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh

 #######
 # SFT #
 #######

 # 1N 1B/8B runs
-recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh

 # Dtensor vs fsdp1 (8B)
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh

 # Functional 32b test
-recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
+tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt
index dfc997435b..69735cb0cb 100644
--- a/tests/test_suites/release.txt
+++ b/tests/test_suites/release.txt
@@ -3,14 +3,14 @@
 ########

 # Long 8b run
-recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
+tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh

 # Long 32b run
-recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh

 #######
 # SFT #
 #######

 # Long 8b convergence
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
\ No newline at end of file
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
\ No newline at end of file
diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 0214d1ca5e..5081692ea1 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -18,8 +18,6 @@
 dir_path = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.abspath(os.path.join(dir_path, "..", ".."))

-recipes_dir = os.path.join(project_root, "recipes")
-
 test_suites_dir = os.path.join(project_root, "tests", "test_suites")

 nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt")
@@ -120,15 +118,15 @@ def test_no_overlap_across_test_suites(all_test_suites):

 def test_all_recipes_accounted_for_in_test_suites(all_test_suites):
     all_recipes_in_test_suites = set(all_test_suites)
-    all_recipes_in_recipes_dir = set()
+    all_tests_in_test_suites_dir = set()
     for recipe_path in glob.glob(
-        os.path.join(recipes_dir, "**", "*.sh"), recursive=True
+        os.path.join(test_suites_dir, "**", "*.sh"), recursive=True
     ):
         # Strip off the project root and leading slash
         recipe_name = recipe_path[len(project_root) + 1 :]
-        all_recipes_in_recipes_dir.add(recipe_name)
+        all_tests_in_test_suites_dir.add(recipe_name)

-    assert all_recipes_in_test_suites == all_recipes_in_recipes_dir, (
+    assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, (
         "All recipes are not accounted for in the test suites"
     )
@@ -172,7 +170,7 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):

 def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
-    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh"
+    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./tests/test_suites/**/*.sh"

     # Run the command from the project root directory
     result = subprocess.run(
@@ -200,3 +198,19 @@ def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
     assert last_line.startswith("[INFO]: Total GPU hours:"), (
         f"Last line of output was not as expected: '{last_line}'"
     )
+
+
+def test_all_tests_can_find_config_if_dryrun(all_test_suites):
+    for test_suite in all_test_suites:
+        command = f"DRYRUN=1 {test_suite}"
+        result = subprocess.run(
+            command,
+            shell=True,
+            cwd=project_root,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert result.returncode == 0, (
+            f"Command failed with exit code {result.returncode}"
+        )
diff --git a/tools/launch b/tools/launch
index 1db03b3b03..4c76cee78d 100755
--- a/tools/launch
+++ b/tools/launch
@@ -40,11 +40,11 @@ extract_config() {
         echo "[ERROR]: Please add and update a section in the script with these variables:"
         echo
         echo "# ===== BEGIN CONFIG ====="
-        echo "NUM_NODES=1"  # How many nodes this job uses
-        echo "STEPS_PER_RUN=60"  # Approximately how many steps reached in one job
-        echo "MAX_STEPS=60"  # Max training steps
+        echo "NUM_NODES=1 # How many nodes this job uses"
+        echo "STEPS_PER_RUN=60 # Approximately how many steps reached in one job"
+        echo "MAX_STEPS=60 # Max training steps"
         echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up'
-        echo "NUM_MINUTES=240"  # How many minutes one job is (SLURM specific)
+        echo "NUM_MINUTES=240 # How many minutes one job is (SLURM specific)"
         echo "# ===== END CONFIG ====="
         return 1
     fi 1>&2

From 7d347f62e08beaaf96963cf38a551cd62da60e4e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sat, 26 Apr 2025 00:00:07 -0700
Subject: [PATCH 13/19] test dryrun to not conflict

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env           | 4 ++--
 tests/unit/test_recipes_and_test_suites.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 918b7bff1b..c2008292b9 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -38,8 +38,8 @@ fi

 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

-if [[ -n "$DRYRUN" ]]; then
-    echo "[INFO] DRYRUN mode: used for testing"
+if [[ -n "${TEST_DRYRUN:-}" ]]; then
+    echo "[INFO] TEST_DRYRUN mode: used for testing"
     exit
 fi

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 5081692ea1..edceba3649 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -202,7 +202,7 @@ def test_dry_run_does_not_fail_and_prints_total_gpu_hours():

 def test_all_tests_can_find_config_if_dryrun(all_test_suites):
     for test_suite in all_test_suites:
-        command = f"DRYRUN=1 {test_suite}"
+        command = f"TEST_DRYRUN=1 {test_suite}"
         result = subprocess.run(
             command,
             shell=True,
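With PATCH 13 applied, the test-only early exit is spelled `TEST_DRYRUN` so it can no longer collide with the `DRYRUN` flag that `tools/launch` consumes. A usage sketch (the recipe name below is just one of the suite scripts above, not the only valid choice):

```sh
# Exits right after CONFIG_PATH resolution, before any directories are created
# or training is launched; a cheap way to verify every script finds its YAML.
TEST_DRYRUN=1 ./tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh

# The launcher-level dry run stays separate and prints the total GPU hours:
DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./tests/test_suites/**/*.sh
```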
From b67c558cb09a67d9153c30a95e5868b1294131dd Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 22:47:56 -0700
Subject: [PATCH 14/19] fix up packaging script to globstar

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index bf8aa1befc..bf8281d154 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -10,11 +10,26 @@ PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
 cd $PROJECT_ROOT

 set -eou pipefail

+# Enable recursive globbing
+shopt -s globstar
+
+OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
+
+# Check if the glob expanded to any files
+if [ -z "$(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events* 2>/dev/null || true)" ]; then
+    echo "Error: No tensorboard event files found matching the pattern."
+    exit 1
+elif [[ -f $OUTPUT_TAR ]]; then
+    echo "Error: $OUTPUT_TAR already exists. Clean it up before continuing."
+    exit 1
+fi

-# Create a temporary directory
 TMP_DIR=$(mktemp -d)
 echo "Created temporary directory: $TMP_DIR"

+# Set up trap to clean up temporary directory on exit
+trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT
+
 # Loop over all the recipe runs and package them into a tarball
 for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
     exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
@@ -32,10 +47,5 @@ for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
 done

 # Create a tarball of all the processed event files
-OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
 tar -czf "$OUTPUT_TAR" -C "$TMP_DIR" .
 echo "Created tarball: $OUTPUT_TAR"
-
-# Clean up the temporary directory
-rm -rf "$TMP_DIR"
-echo "Cleaned up temporary directory $TMP_DIR"

From 36506abc0485550a82137ae7f8908d86114ebeed Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 23:19:46 -0700
Subject: [PATCH 15/19] another fix

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index bf8281d154..b2570c1af5 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -32,7 +32,7 @@ trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT

 # Loop over all the recipe runs and package them into a tarball
 for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
-    exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
+    exp_name=$(basename -- $(cut -d/ -f2 <<<$tbevent) -logs)
     # Obfuscate the hostname
     # events.out.tfevents.1744822578..780899.0
     obfuscated_event_path=$(basename $tbevent | awk -F. '{print $1"."$2"."$3"."$4".HOSTNAME."$(NF-1)"."$NF}')

From bd352faf1f019753f8c88a7f9433e49e455280a6 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 23:26:18 -0700
Subject: [PATCH 16/19] final fix

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index b2570c1af5..357c9ad618 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -15,8 +15,10 @@ shopt -s globstar

 OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"

+TB_EVENTS=$(ls code_snapshots/*/tests/test_suites/**/logs/*/tensorboard/events* || true)
+
 # Check if the glob expanded to any files
-if [ -z "$(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events* 2>/dev/null || true)" ]; then
+if [ -z "$TB_EVENTS" ]; then
     echo "Error: No tensorboard event files found matching the pattern."
     exit 1
 elif [[ -f $OUTPUT_TAR ]]; then
@@ -31,7 +33,7 @@ echo "Created temporary directory: $TMP_DIR"

 trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT

 # Loop over all the recipe runs and package them into a tarball
-for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
+for tbevent in $TB_EVENTS; do
     exp_name=$(basename -- $(cut -d/ -f2 <<<$tbevent) -logs)
     # Obfuscate the hostname
     # events.out.tfevents.1744822578..780899.0
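Patches 14 through 16 converge on a defensive shell pattern worth isolating: capture the `**` glob once, fail fast when it matches nothing, and tie temp-directory cleanup to an EXIT trap so it also runs on error paths. A stripped-down sketch of that skeleton (the glob path is abbreviated here for illustration):

```sh
#!/bin/bash
set -eou pipefail
shopt -s globstar  # make ** match nested directories (off by default in bash)

# Capture matches once; `|| true` keeps set -e from aborting on zero matches
TB_EVENTS=$(ls code_snapshots/*/tests/test_suites/**/tensorboard/events* || true)
if [ -z "$TB_EVENTS" ]; then
    echo "Error: no tensorboard event files found" >&2
    exit 1
fi

TMP_DIR=$(mktemp -d)
trap "rm -rf $TMP_DIR" EXIT  # cleanup fires on success, failure, and Ctrl-C
```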
From b0147def9fb95f16ee34312289f7348d000cbc79 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 12:20:10 -0700
Subject: [PATCH 17/19] fix renaming missed

Signed-off-by: Terry Kong
---
 docker/Dockerfile                                                 | 2 +-
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh            | 2 +-
 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 2 +-
 tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b1977a4ac9..2baf5d4ea3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -17,7 +17,7 @@ RUN chmod 755 /home/ray/.cache

 FROM base AS hermetic

-WORKDIR /opt/reinforcer
+WORKDIR /opt/nemo-rl

 # First copy only the dependency files
 COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
index 1f937018a3..32bb6dacb7 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
@@ -32,7 +32,7 @@ uv run examples/run_sft.py \
 # Convert tensorboard logs to json
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263
+# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/nemo-rl/issues/263
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     # TODO: FIGURE OUT CORRECT METRICS
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
index 2379681138..ac441240fc 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
@@ -31,7 +31,7 @@ uv run examples/run_sft.py \
 # Convert tensorboard logs to json
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263
+# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/nemo-rl/issues/263
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
index fd40a85764..9fb5f7839b 100755
--- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
+++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
@@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env

 # TODO: this config can crash on OOM
-# https://github.com/NVIDIA/reinforcer/issues/263
+# https://github.com/NVIDIA/nemo-rl/issues/263

 # ===== BEGIN CONFIG =====
 NUM_NODES=4

From 0f8f5e86575548723b7cd083243ad5cd5715702e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 14:54:38 -0700
Subject: [PATCH 18/19] increase the test time a little and time the functional

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index e0f0a6532b..c38cc2dd87 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -150,7 +150,7 @@ jobs:
     if: ${{ needs.pre-flight.outputs.test_level != 'none' }}
     with:
       RUNNER: self-hosted-azure
-      TIMEOUT: 60
+      TIMEOUT: 75
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
@@ -168,10 +168,10 @@
       FUNCTIONAL_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then
-          uv run --no-sync bash ./tests/functional/sft.sh
-          uv run --no-sync bash ./tests/functional/grpo.sh
-          uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
-          uv run --no-sync bash ./tests/functional/dpo.sh
+          time uv run --no-sync bash ./tests/functional/sft.sh
+          time uv run --no-sync bash ./tests/functional/grpo.sh
+          time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
+          time uv run --no-sync bash ./tests/functional/dpo.sh
         else
           echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }}
         fi

From 139c4cf79e2a61a074942c0e1bfea23347558168 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 14:58:05 -0700
Subject: [PATCH 19/19] fix tests

Signed-off-by: Terry Kong
---
 tests/functional/dpo.sh            |  6 +++---
 tests/functional/grpo.sh           |  6 +++---
 tests/functional/grpo_multiturn.sh | 19 ++++++++++---------
 tests/functional/sft.sh            |  6 +++---
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index e719f84b79..200a08cdd7 100755
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -19,7 +19,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_dpo.py \
+uv run $PROJECT_ROOT/examples/run_dpo.py \
     cluster.gpus_per_node=2 \
     dpo.max_num_steps=3 \
     dpo.val_batches=1 \
@@ -32,8 +32,8 @@ python -u $PROJECT_ROOT/examples/run_dpo.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["2"] < 0.694' \
diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh
index 93b4ec25e1..bbbbd44a11 100755
--- a/tests/functional/grpo.sh
+++ b/tests/functional/grpo.sh
@@ -19,7 +19,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_grpo_math.py \
+uv run $PROJECT_ROOT/examples/run_grpo_math.py \
     cluster.gpus_per_node=2 \
     grpo.max_num_steps=3 \
     logger.tensorboard_enabled=true \
@@ -29,8 +29,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_math.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \
diff --git a/tests/functional/grpo_multiturn.sh b/tests/functional/grpo_multiturn.sh
index ff9befcdd7..a22153c729 100755
--- a/tests/functional/grpo_multiturn.sh
+++ b/tests/functional/grpo_multiturn.sh
@@ -7,17 +7,19 @@ git config --global --add safe.directory $PROJECT_ROOT

 set -eou pipefail

-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log

 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
+uv run $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
     cluster.gpus_per_node=2 \
     grpo.max_rollout_turns=10 \
     grpo.max_num_steps=3 \
@@ -32,9 +34,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
     $@ \
     2>&1 | tee $RUN_LOG

-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \
diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index 812733338c..90985ae2c1 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -22,7 +22,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_sft.py \
+uv run $PROJECT_ROOT/examples/run_sft.py \
     policy.model_name=meta-llama/Llama-3.2-1B \
     cluster.gpus_per_node=2 \
     sft.max_num_steps=10 \
@@ -36,9 +36,9 @@ python -u $PROJECT_ROOT/examples/run_sft.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

 # TODO: loss is very noisy, this check is mainly for sanity of immediate divergence
-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["9"] < 1500' \
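After PATCH 19, every functional script goes through `uv run` and shares the `$EXP_DIR/{logs,metrics.json,run.log}` layout, so reproducing the CI functional stage locally is one command per suite. A sketch under the assumption of a checkout with `uv` installed and at least 2 GPUs (the scripts pass `cluster.gpus_per_node=2`); the metrics paths below follow the layout the scripts set up and are illustrative:

```sh
# From the repo root, mirror what CI runs for one functional suite:
time uv run --no-sync bash ./tests/functional/sft.sh

# The script then dumps TensorBoard logs to JSON and asserts on them,
# equivalent to invoking the two helpers by hand:
uv run tests/json_dump_tb_logs.py tests/functional/sft/logs --output_path metrics.json
uv run tests/check_metrics.py metrics.json 'data["train/loss"]["9"] < 1500'
```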