From bc48e8465eba0e179997236b6890353548737d16 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 16 Apr 2025 15:28:34 -0700
Subject: [PATCH 01/19] feat: release convergence runs

Signed-off-by: Terry Kong

add qwen

Signed-off-by: Terry Kong

llama3.2 1b recipe

Signed-off-by: Terry Kong

fsdp2 tests

Signed-off-by: Terry Kong

NUM_HOURS

Signed-off-by: Terry Kong

add more tests for sft dtensor

Signed-off-by: Terry Kong

move things around

Signed-off-by: Terry Kong

fix

Signed-off-by: Terry Kong

update

Signed-off-by: Terry Kong

correct the paths

Signed-off-by: Terry Kong

missing copyright

Signed-off-by: Terry Kong

add code snapshot and continuing ability

Signed-off-by: Terry Kong

moving things around

Signed-off-by: Terry Kong

performance directory

Signed-off-by: Terry Kong

misplaced .gitkeep

Signed-off-by: Terry Kong

recursive glob

Signed-off-by: Terry Kong

fix

Signed-off-by: Terry Kong

move to minutes which is better granularity for the perf team

Signed-off-by: Terry Kong

update hermetic

Signed-off-by: Terry Kong

move things around

Signed-off-by: Terry Kong

fix up

Signed-off-by: Terry Kong

readme cleanup

Signed-off-by: Terry Kong

add model names to guard against model names changing from the default configs

Signed-off-by: Terry Kong

cleanup

Signed-off-by: Terry Kong

test cases

Signed-off-by: Terry Kong

missing files

Signed-off-by: Terry Kong

fix all tests

Signed-off-by: Terry Kong

incorporated everyone's feedback

Signed-off-by: Terry Kong

make sure hfhome/cache are propagated

Signed-off-by: Terry Kong

project_root correction

Signed-off-by: Terry Kong

launch MOUNTS typo

Signed-off-by: Terry Kong

docs

Signed-off-by: Terry Kong

copyright

Signed-off-by: Terry Kong

typo

Signed-off-by: Terry Kong

clean up docs

Signed-off-by: Terry Kong

wip

Signed-off-by: Terry Kong

fix the mount

Signed-off-by: Terry Kong

fix all 70b -> 32b tests

Signed-off-by: Terry Kong

get all the test step times down

Signed-off-by: Terry Kong
---
 .gitignore                                    |   2 +
 nemo_rl/__init__.py                           |  13 ++
 recipes/README.md                             |  67 ++++++
 ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh |  65 ++++++
 ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh |  65 ++++++
 ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh |  69 ++++++
 ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh |  69 ++++++
 .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh    |  64 ++++++
 ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh |  67 ++++++
 ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh |  65 ++++++
 recipes/llm/performance/.gitkeep              |   0
 .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh    |  66 ++++++
 ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh |  68 ++++++
 ...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh |  68 ++++++
 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh  |  66 ++++++
 ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh |  70 ++++++
 tests/README.md                               |  20 ++
 tests/{functional => }/check_metrics.py       |   0
 tests/functional/dpo.sh                       |  18 +-
 tests/functional/grpo.sh                      |  19 +-
 tests/functional/sft.sh                       |  21 +-
 tests/{functional => }/json_dump_tb_logs.py   |   0
 tests/test_suites/nightly.txt                 |  28 +++
 tests/test_suites/nightly_performance.txt     |   0
 tests/test_suites/release.txt                 |  16 ++
 tests/test_suites/release_performance.txt     |   0
 tests/unit/test_recipes_and_test_suites.py    | 200 ++++++++++++++++++
 tools/autoformat.sh                           |   0
 tools/code_snapshot.sh                        |  40 ++++
 tools/launch                                  | 175 +++++++++++++++
 tools/package_release_runs.sh                 |  41 ++++
 31 files changed, 1434 insertions(+), 28 deletions(-)
 create mode 100644 recipes/README.md
 create mode 100755 recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
 create mode 100755
recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh
 create mode 100755 recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
 create mode 100644 recipes/llm/performance/.gitkeep
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
 create mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
 create mode 100755 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
 create mode 100755 recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
 create mode 100644 tests/README.md
 rename tests/{functional => }/check_metrics.py (100%)
 rename tests/{functional => }/json_dump_tb_logs.py (100%)
 create mode 100644 tests/test_suites/nightly.txt
 create mode 100644 tests/test_suites/nightly_performance.txt
 create mode 100644 tests/test_suites/release.txt
 create mode 100644 tests/test_suites/release_performance.txt
 create mode 100644 tests/unit/test_recipes_and_test_suites.py
 mode change 100644 => 100755 tools/autoformat.sh
 create mode 100644 tools/code_snapshot.sh
 create mode 100755 tools/launch
 create mode 100755 tools/package_release_runs.sh

diff --git a/.gitignore b/.gitignore
index 478990ddc8..12121a4155 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@ apidocs/
 dist/
 *.egg-info/
 *.vscode/
+release_run*
+ckpts/
 
 # Test
 coverage.json

diff --git a/nemo_rl/__init__.py b/nemo_rl/__init__.py
index 1606956b87..c755e5ed0f 100644
--- a/nemo_rl/__init__.py
+++ b/nemo_rl/__init__.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from nemo_rl.package_info import (
     __contact_emails__,

diff --git a/recipes/README.md b/recipes/README.md
new file mode 100644
index 0000000000..3ccf0d75c9
--- /dev/null
+++ b/recipes/README.md
@@ -0,0 +1,67 @@
+# Recipes
+
+## Naming
+
+Each test is named:
+```
+<algo>-<model>-#n#g-<parallelism>-<variant>.sh
+```
+where `#n#g` encodes the node and GPU counts (e.g., `4n8g` = 4 nodes x 8 GPUs per node)
+and the trailing `<variant>` (e.g., `long`) is optional.
+
+Examples:
+* sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh
+* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh
+
+## Running manually
+
+Each recipe can be run on the head node:
+
+```sh
+uv run ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+```
+
+and the result directory can be found at the same level as the script (same name, without the `.sh` suffix):
+
+```sh
+ls -lh llm/sft-llama3.2-1b-1n8g-fsdp2tp1/
+# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts
+# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs
+# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json
+# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log
+```
+
+## Launching with code snapshots
+
+We provide a convenience script that creates a code snapshot and launches
+`NUM_RUNS` slurm jobs (`NUM_RUNS` is defined in a config header at the top of
+each recipe, shown below).
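+A representative config header (the values here are copied from the 8B GRPO
+convergence recipe in this repo; every recipe sets its own):
+
+```sh
+# ===== BEGIN CONFIG =====
+NUM_NODES=4          # Nodes used by each slurm job
+STEPS_PER_RUN=100    # Steps one job is expected to complete within its walltime
+MAX_STEPS=500        # Total training steps across all chained jobs
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=240      # Slurm walltime per job, in minutes
+# ===== END CONFIG =====
+```
+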
+We create a code snapshot to ensure that even as the master repo's code
+changes, you can always rerun your experiment against the snapshot of the code
+as it was when the experiment was first launched.
+
+```sh
+# Launch
+CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+
+# Prints estimated GPU hours and then exits
+DRYRUN=1 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+
+# Prints estimated GPU hours, creates the code snapshot, then exits
+DRYRUN=2 CONTAINER=... ACCOUNT=... PARTITION=... ../tools/launch ./llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+```
+
+After this completes, you can find the result under
+
+```sh
+ls -lh ../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1/
+# drwxr-xr-x 2 terryk dip 4.0K Apr 23 18:07 ckpts
+# drwxr-xr-x 3 terryk dip 4.0K Apr 23 18:07 logs
+# -rw-r--r-- 1 terryk dip 142K Apr 23 18:23 metrics.json
+# -rw-r--r-- 1 terryk dip 94K Apr 23 18:23 run.log
+```
+
+As a convenience, there's also a `continue.sh` script in each snapshot
+directory that will launch another run using the same arguments. This is
+helpful if your job was unexpectedly cancelled or you want to run it for a
+little longer.
+
+```sh
+# This launches one more run of the same experiment
+../code_snapshots/sft-llama3.2-1b-1n8g-fsdp2tp1/continue.sh
+```

diff --git a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
new file mode 100755
index 0000000000..3feb431a2f
--- /dev/null
+++ b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -eou pipefail
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=100
+MAX_STEPS=500
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=240
+# ===== END CONFIG =====
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
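+# The BEGIN/END CONFIG block above is what tools/launch parses (extract_config)
+# to size the submission: NUM_RUNS jobs of ~STEPS_PER_RUN steps each are
+# submitted, each with a NUM_MINUTES walltime, until MAX_STEPS is covered.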
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..14df2cfe5f --- /dev/null +++ b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
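+# All run artifacts (logs/, ckpts/, metrics.json, run.log) are derived from
+# this script's name below, so results land in a sibling directory named after
+# the recipe.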
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B.yaml \ + policy.model_name=meta-llama/Llama-3.2-1B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh new file mode 100755 index 0000000000..04a380c746 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
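+# The early-stopping check below asks jq for the highest step already logged in
+# metrics.json; on a fresh run the file does not exist yet, and the `|| echo 0`
+# fallback keeps the comparison well-defined.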
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + policy.max_total_sequence_length=16384 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..466b1a41ec --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +MAX_STEPS=2 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
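+# Training below runs FSDP2 (dtensor) with TP=8 plus sequence parallelism and
+# activation checkpointing, while vLLM generation uses its own TP=4 setting.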
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + policy.max_total_sequence_length=16384 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh new file mode 100755 index 0000000000..cb8f5c9bca --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
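+# dtensor is disabled below, so this recipe exercises the FSDP1 path; it pairs
+# with the fsdp2tp4sp variant for an FSDP1-vs-dtensor comparison.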
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-7B-Instruct \ + policy.dtensor_cfg.enabled=false \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh new file mode 100755 index 0000000000..a4037b01d7 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +MAX_STEPS=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
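+# The trailing "$@" on the python invocation below forwards any extra CLI
+# arguments as config overrides, so one-off tweaks don't require editing the
+# recipe.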
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_8B.yaml \ + policy.model_name=Qwen/Qwen2.5-7B-Instruct \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=4 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.generation.vllm_cfg.tensor_parallel_size=4 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["160"] < 1.1' +fi + diff --git a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..89bf673983 --- /dev/null +++ b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,65 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +MAX_STEPS=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
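+# check_metrics.py below is gated on reaching the target step count, so a
+# partially finished chain of jobs is not failed for incomplete metrics.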
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B.yaml \ + policy.model_name=Qwen/Qwen2.5-Math-1.5B-Instruct \ + policy.dtensor_cfg.enabled=true \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + grpo.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/performance/.gitkeep b/recipes/llm/performance/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh new file mode 100755 index 0000000000..70f834e0a8 --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +MAX_STEPS=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
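+# Stdout/stderr are tee'd to run.log below, and the tensorboard event files are
+# dumped to metrics.json afterwards so the metric checks can run on plain JSON.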
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=False \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..1402a094dc --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eou pipefail + +# TODO: @ashors real convergence run +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=12000 +MAX_STEPS=12000 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
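+# STEPS_PER_RUN equals MAX_STEPS above, so NUM_RUNS resolves to 1 and the whole
+# convergence run fits in a single slurm job.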
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh new file mode 100755 index 0000000000..3f5ce413eb --- /dev/null +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=350 +MAX_STEPS=350 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=2 \ + policy.dtensor_cfg.sequence_parallel=True \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..2c6f930399 --- /dev/null +++ b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=1000 +MAX_STEPS=1000 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=meta-llama/Llama-3.2-1B \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=1 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi + diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..ad23383dc9 --- /dev/null +++ b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -eou pipefail + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=20 # step_time ~ 29sec +MAX_STEPS=20 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
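+# This 32B SFT recipe pushes the sequence length to 16000 below, hence TP=8,
+# sequence parallelism, and activation checkpointing are all enabled to keep
+# activation memory in check.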
+# Mark the current repo as safe, since wandb fetchs metadata about the repo +git config --global --add safe.directory $PROJECT_ROOT + +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +# Early stopping to save compute if max step has been reached +STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) +if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then + echo "[INFO] Target step $MAX_STEPS reached, skipping run" + exit 0 +fi +echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" + +# Run the experiment +cd $PROJECT_ROOT +python -u examples/run_sft.py \ + --config examples/configs/sft.yaml \ + policy.model_name=Qwen/Qwen2.5-32B \ + policy.precision=bfloat16 \ + policy.dtensor_cfg.enabled=True \ + policy.dtensor_cfg.tensor_parallel_size=8 \ + policy.dtensor_cfg.sequence_parallel=True \ + policy.dtensor_cfg.activation_checkpointing=True \ + policy.max_total_sequence_length=16000 \ + cluster.num_nodes=$NUM_NODES \ + cluster.gpus_per_node=8 \ + sft.max_num_steps=$MAX_STEPS \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + python -u tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["60"] < 0.45' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 30000' +fi diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000000..4e51a6efad --- /dev/null +++ b/tests/README.md @@ -0,0 +1,20 @@ +# Tests + +## Launching Release Tests + +```sh +# Assuming in NeMo RL project root + +cd tools/ + +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=1 to get a rough estimate of compute +DRYRUN=1 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# DRYRUN=2 will create a codesnapshot with a fully hermetic example +DRYRUN=2 IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... + +# Run all (Caution: this will use a lot of compute; consider listing out the jobs) +IS_RELEASE=1 CONTAINER=... ACCOUNT=... PARTITION=... 
./launch ../recipes/**/*.sh
+```

diff --git a/tests/functional/check_metrics.py b/tests/check_metrics.py
similarity index 100%
rename from tests/functional/check_metrics.py
rename to tests/check_metrics.py

diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index 2421c5da6a..e719f84b79 100755
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -7,15 +7,16 @@ git config --global --add safe.directory $PROJECT_ROOT
 
 set -eou pipefail
 
-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
-export RAY_DEDUP_LOGS=0
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_dpo.py \
@@ -31,9 +32,8 @@ python -u $PROJECT_ROOT/examples/run_dpo.py \
     $@ \
     2>&1 | tee $RUN_LOG
 
-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
-python check_metrics.py $JSON_METRICS \
+python -u tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["2"] < 0.694' \

diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh
index b61442227b..93b4ec25e1 100755
--- a/tests/functional/grpo.sh
+++ b/tests/functional/grpo.sh
@@ -2,19 +2,21 @@
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo
+# Mark the current repo as safe, since wandb fetches metadata about the repo
 git config --global --add safe.directory $PROJECT_ROOT
 
 set -eou pipefail
 
-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log
 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_grpo_math.py \
@@ -27,9 +29,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_math.py \
     $@ \
     2>&1 | tee $RUN_LOG
 
-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
-python check_metrics.py $JSON_METRICS \
+python -u tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \

diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index f3474fb0fd..812733338c 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -1,23 +1,25 @@
 #!/bin/bash
 
-## clean up checkpoint directory on exit
+# clean up checkpoint directory on exit
 trap "rm -rf /tmp/sft_checkpoints" EXIT
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..)
-# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetches metadata about the repo git config --global --add safe.directory $PROJECT_ROOT set -eou pipefail -LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs -JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json -RUN_LOG=$LOG_DIR/$(basename $0 .sh).log +EXP_NAME=$(basename $0 .sh) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} -rm -rf $LOG_DIR -mkdir -p $LOG_DIR +rm -rf $EXP_DIR $LOG_DIR +mkdir -p $EXP_DIR $LOG_DIR cd $PROJECT_ROOT python -u $PROJECT_ROOT/examples/run_sft.py \ @@ -34,10 +36,9 @@ python -u $PROJECT_ROOT/examples/run_sft.py \ $@ \ 2>&1 | tee $RUN_LOG -cd $SCRIPT_DIR -python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: loss is very noisy, this check is mainly for sanity of immediate divergence -python check_metrics.py $JSON_METRICS \ +python -u tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["9"] < 1500' \ diff --git a/tests/functional/json_dump_tb_logs.py b/tests/json_dump_tb_logs.py similarity index 100% rename from tests/functional/json_dump_tb_logs.py rename to tests/json_dump_tb_logs.py diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt new file mode 100644 index 0000000000..9b4eac9491 --- /dev/null +++ b/tests/test_suites/nightly.txt @@ -0,0 +1,28 @@ +######## +# GRPO # +######## + +# Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much +recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh + +# FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) +recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh + +# Functional 32b run +recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh + +####### +# SFT # +####### + +# 1N 1B/8B runs +recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh + +# Dtensor vs fsdp1 (8B) +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh + +# Functional 32b test +recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh diff --git a/tests/test_suites/nightly_performance.txt b/tests/test_suites/nightly_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt new file mode 100644 index 0000000000..dfc997435b --- /dev/null +++ b/tests/test_suites/release.txt @@ -0,0 +1,16 @@ +######## +# GRPO # +######## + +# Long 8b run +recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh + +# Long 32b run +recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh + +####### +# SFT # +####### + +# Long 8b convergence +recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh \ No newline at end of file diff --git a/tests/test_suites/release_performance.txt b/tests/test_suites/release_performance.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py new file mode 100644 index 0000000000..100486ddf4 --- /dev/null +++ b/tests/unit/test_recipes_and_test_suites.py @@ -0,0 +1,200 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import os +import glob +import subprocess + +dir_path = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) +recipes_dir = os.path.join(project_root, "recipes") + +test_suites_dir = os.path.join(project_root, "tests", "test_suites") + +nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") +release_test_suite_path = os.path.join(test_suites_dir, "release.txt") +nightly_performance_test_suite_path = os.path.join( + test_suites_dir, "nightly_performance.txt" +) +release_performance_test_suite_path = os.path.join( + test_suites_dir, "release_performance.txt" +) + + +@pytest.fixture +def nightly_test_suite(): + nightly_suite = [] + with open(nightly_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + nightly_suite.append(line) + return nightly_suite + + +@pytest.fixture +def release_test_suite(): + release_suite = [] + with open(release_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + release_suite.append(line) + return release_suite + + +@pytest.fixture +def nightly_performance_test_suite(): + nightly_performance_suite = [] + with open(nightly_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + nightly_performance_suite.append(line) + return nightly_performance_suite + + +@pytest.fixture +def release_performance_test_suite(): + release_performance_suite = [] + with open(release_performance_test_suite_path, "r") as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + release_performance_suite.append(line) + return release_performance_suite + + +@pytest.fixture +def all_test_suites( + nightly_test_suite, + release_test_suite, + nightly_performance_test_suite, + release_performance_test_suite, +): + return ( + nightly_test_suite + + release_test_suite + + nightly_performance_test_suite + + release_performance_test_suite + ) + + +@pytest.mark.parametrize( + "test_suite_path", + [ + nightly_test_suite_path, + release_test_suite_path, + nightly_performance_test_suite_path, + release_performance_test_suite_path, + ], + ids=[ + "nightly_test_suite", + "release_test_suite", + "nightly_performance_test_suite", + "release_performance_test_suite", + ], +) +def test_test_suites_exist(test_suite_path): + assert os.path.exists(test_suite_path), ( + f"Test suite {test_suite_path} does not exist" + ) + + +def test_no_overlap_across_test_suites(all_test_suites): + recipes = set(all_test_suites) + assert len(recipes) == len(all_test_suites), f"Test suites have repeats {recipes}" + + +def test_all_recipes_accounted_for_in_test_suites(all_test_suites): + all_recipes_in_test_suites = set(all_test_suites) + + all_recipes_in_recipes_dir = set() + for recipe_path in glob.glob( + os.path.join(recipes_dir, "**", "*.sh"), recursive=True + ): + # Strip off the project root 
and leading slash + recipe_name = recipe_path[len(project_root) + 1 :] + all_recipes_in_recipes_dir.add(recipe_name) + + assert all_recipes_in_test_suites == all_recipes_in_recipes_dir, ( + "All recipes are not accounted for in the test suites" + ) + + +def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker): + command = f"DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}" + + print(f"Running command: {command}") + + # Run the command from the project root directory + result = subprocess.run( + command, + shell=True, + cwd=project_root, + capture_output=True, + text=True, + check=False, # Don't raise exception on non-zero exit code + ) + + # Print stdout and stderr for debugging if the test fails + print("STDOUT:") + print(result.stdout) + print("STDERR:") + print(result.stderr) + + # Assert that the command exited successfully + assert result.returncode == 0, f"Command failed with exit code {result.returncode}" + + # Assert that the last line of stdout contains the expected prefix + stdout_lines = result.stdout.strip().splitlines() + assert len(stdout_lines) > 0, "Command produced no output" + last_line = stdout_lines[-1] + assert last_line.startswith("[INFO]: Total GPU hours:"), ( + f"Last line of output was not as expected: '{last_line}'" + ) + total_gpu_hours = float(last_line.split(":")[-1].strip()) + assert total_gpu_hours <= 1024, f"Total GPU hours exceeded 1024: {last_line}" + tracker.track("total_nightly_gpu_hours", total_gpu_hours) + + +def test_dry_run_does_not_fail_and_prints_total_gpu_hours(): + command = "DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh" + + # Run the command from the project root directory + result = subprocess.run( + command, + shell=True, + cwd=project_root, + capture_output=True, + text=True, + check=False, # Don't raise exception on non-zero exit code + ) + + # Print stdout and stderr for debugging if the test fails + print("STDOUT:") + print(result.stdout) + print("STDERR:") + print(result.stderr) + + # Assert that the command exited successfully + assert result.returncode == 0, f"Command failed with exit code {result.returncode}" + + # Assert that the last line of stdout contains the expected prefix + stdout_lines = result.stdout.strip().splitlines() + assert len(stdout_lines) > 0, "Command produced no output" + last_line = stdout_lines[-1] + assert last_line.startswith("[INFO]: Total GPU hours:"), ( + f"Last line of output was not as expected: '{last_line}'" + ) diff --git a/tools/autoformat.sh b/tools/autoformat.sh old mode 100644 new mode 100755 diff --git a/tools/code_snapshot.sh b/tools/code_snapshot.sh new file mode 100644 index 0000000000..62136a8632 --- /dev/null +++ b/tools/code_snapshot.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -euo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) +PROJECT_ROOT=${SCRIPT_DIR}/.. +cd ${PROJECT_ROOT} + +echo2() { + echo "$@" >&2 +} + +if [[ ! -e "$PROJECT_ROOT/.git" ]]; then + echo2 "[Error]: This script was not run from the root of NeMo RL git repo. Please clone it first." + exit 1 +elif [[ $# -lt 1 ]]; then + echo2 "[Error]: This script requires one argument: the name of the experiment to be used as the snapshot directory name" + echo2 "Usage: bash tools/code_snapshot.sh " + exit 1 +fi + +EXP_NAME=$1 + +SNAPSHOT_DIR="$PROJECT_ROOT/code_snapshots/${EXP_NAME}" +if [[ ! 
-d "$SNAPSHOT_DIR" ]]; then + echo2 "Creating new code snapshot in $SNAPSHOT_DIR" + mkdir -p $SNAPSHOT_DIR +else + echo2 "Using existing code snapshot in $SNAPSHOT_DIR" + # Echo the snapshot directory so the caller can use it to `cd` into it + echo ${SNAPSHOT_DIR} + exit +fi + +echo2 "Copying git-tracked files..." +rsync -a --files-from=<(git ls-files) ./ $SNAPSHOT_DIR/ + + +# Echo the snapshot directory so the caller can use it to `cd` into it +echo ${SNAPSHOT_DIR} \ No newline at end of file diff --git a/tools/launch b/tools/launch new file mode 100755 index 0000000000..1db03b3b03 --- /dev/null +++ b/tools/launch @@ -0,0 +1,175 @@ +#!/bin/bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +# This is a helper script to launch a release test on slurm. +# It reads a demarcated section of the script to extract the config, +# and uses that to determine how many nodes and how many chained jobs to launch. +# +# It also creates a code snapshot to ensure that the code is reproducible and subsequent +# jobs can be launched with the same code. It also creates a continue.sh in the code +# snapshot directory to continue launching the job even if the original invocation was +# forgotten. +# +# Usage: +# CONTAINER=... ACCOUNT=... PARTITION=... ./launch ... +# + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +PROJECT_ROOT=$(realpath $SCRIPT_DIR/..) + +# Function to extract config from a script +extract_config() { + local script_path="$1" + local config=$(sed -n '/^# =\+ BEGIN CONFIG =\+/,/^# =\+ END CONFIG =\+/p' "$script_path" | + grep -v "^#" | + grep "=" ) + if [[ -z "$config" ]]; then + echo "[ERROR]: No config section found in script_path=$script_path" + echo "[ERROR]: Please add and update a section in the script with these variables:" + echo + echo "# ===== BEGIN CONFIG =====" + echo "NUM_NODES=1" # How many nodes this job uses + echo "STEPS_PER_RUN=60" # Approximately how many steps reached in one job + echo "MAX_STEPS=60" # Max training steps + echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up' + echo "NUM_MINUTES=240" # How many minutes one job is (SLURM specific) + echo "# ===== END CONFIG =====" + return 1 + fi 1>&2 + echo "$config" +} + +check_file_in_version_control_and_get_relpath_from_git_root() { + local script_path="$1" + # Check if the script is tracked in git (assumes we're in the repo already) + rel_path_from_git_root=$(git ls-files --full-name --error-unmatch "$script_path") + ret_code=$? + if [[ $ret_code -ne 0 ]]; then + echo "[ERROR]: Script '$script_path' is not tracked in version control." >&2 + echo "[ERROR]: This may cause reproducibility issues. Add it to git to continue." >&2 + return 1 + fi + echo "$rel_path_from_git_root" +} + +set -eou pipefail + +if [[ $# -eq 0 ]]; then + echo "Error: No script provided." + echo "Usage: CONTAINER=... ACCOUNT=... PARTITION=... 
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+
+# Function to extract config from a script
+extract_config() {
+    local script_path="$1"
+    local config=$(sed -n '/^# =\+ BEGIN CONFIG =\+/,/^# =\+ END CONFIG =\+/p' "$script_path" |
+        grep -v "^#" |
+        grep "=" )
+    if [[ -z "$config" ]]; then
+        echo "[ERROR]: No config section found in script_path=$script_path"
+        echo "[ERROR]: Please add and update a section in the script with these variables:"
+        echo
+        echo "# ===== BEGIN CONFIG ====="
+        echo "NUM_NODES=1" # How many nodes this job uses
+        echo "STEPS_PER_RUN=60" # Approximately how many steps reached in one job
+        echo "MAX_STEPS=60" # Max training steps
+        echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up'
+        echo "NUM_MINUTES=240" # How many minutes one job is (SLURM specific)
+        echo "# ===== END CONFIG ====="
+        return 1
+    fi 1>&2
+    echo "$config"
+}
+
+check_file_in_version_control_and_get_relpath_from_git_root() {
+    local script_path="$1"
+    # Check if the script is tracked in git (assumes we're in the repo already)
+    rel_path_from_git_root=$(git ls-files --full-name --error-unmatch "$script_path")
+    ret_code=$?
+    if [[ $ret_code -ne 0 ]]; then
+        echo "[ERROR]: Script '$script_path' is not tracked in version control." >&2
+        echo "[ERROR]: This may cause reproducibility issues. Add it to git to continue." >&2
+        return 1
+    fi
+    echo "$rel_path_from_git_root"
+}
+
+set -eou pipefail
+
+if [[ $# -eq 0 ]]; then
+    echo "Error: No script provided."
+    echo "Usage: CONTAINER=... ACCOUNT=... PARTITION=... $0 ..."
+    exit 1
+fi
+
+# Check for mandatory environment variables
+for VAR in "HF_HOME" "HF_DATASETS_CACHE"; do
+    if [[ -z "${!VAR:-}" ]]; then
+        echo "[ERROR]: $VAR environment variable is not set."
+        echo "[ERROR]: Please set $VAR to specify the appropriate Hugging Face directory."
+        echo "Example: export $VAR=/path/to/appropriate/directory"
+        exit 1
+    fi
+done
+
+CONTAINER=$CONTAINER
+ACCOUNT=$ACCOUNT
+PARTITION=$PARTITION
+MOUNTS=${MOUNTS:-}
+# DRYRUN=1 prints the runs and how much compute they use
+# DRYRUN=2 additionally creates the snapshots (helpful to run a hermetic example manually or share a repro)
+DRYRUN=${DRYRUN:-}
+IS_RELEASE=${IS_RELEASE:-} # Adds extra configuration for wandb to track this in the right project
+NOW=$(date '+%y%m%d-%H%M%S')
+
+if [[ -n "$MOUNTS" ]]; then
+    # Comma needed since we always mount PWD
+    MOUNTS=",$MOUNTS"
+fi
+
+SCRIPTS=""
+for SCRIPT in $@; do
+    if [[ ! -f "$SCRIPT" ]]; then
+        echo "Error: Script '$SCRIPT' does not exist or is not a file."
+        echo "Please provide a valid script path."
+        exit 1
+    fi
+    SCRIPTS+=" $SCRIPT"
+done
+
+total_gpu_hours=0
+
+for SCRIPT in $SCRIPTS; do
+    # Extract and evaluate the config
+    if ! config=$(extract_config $SCRIPT); then
+        # Error message is already printed by extract_config
+        exit 1
+    fi
+    eval "$config"
+
+    # NUM_RUNS * NUM_NODES * NUM_GPUS * (NUM_MINUTES / 60)
+    gpu_hours=$((NUM_RUNS * NUM_NODES * 8 * NUM_MINUTES / 60))
+    total_gpu_hours=$((total_gpu_hours + gpu_hours))
+    echo "[INFO]: $gpu_hours GPUhrs to run $SCRIPT"
+    if [[ "${DRYRUN}" -eq 1 ]]; then
+        echo "[DRY_RUN]: Skipping creation of snapshot and submission of $SCRIPT."
+        continue
+    fi
+
+    rel_script=$(check_file_in_version_control_and_get_relpath_from_git_root $SCRIPT)
+
+    EXP_NAME=$(basename $SCRIPT .sh)
+    SNAPSHOT_DIR=$(bash $PROJECT_ROOT/tools/code_snapshot.sh $EXP_NAME)
+
+    # Now use the variables
+    for i in $(seq 1 $NUM_RUNS); do
+        echo "Submitting $i/$NUM_RUNS job with ${NUM_NODES} nodes for $(basename $SCRIPT)"
+        JOB_NAME=$(basename $SCRIPT .sh)
+
+        RELEASE_ARGS=()
+        if [[ -n "${IS_RELEASE}" ]]; then
+            RELEASE_ARGS=(
+                logger.wandb.project=nemo-rl-release
+                logger.wandb.name=$(basename $SCRIPT .sh)-$(git rev-parse --short HEAD)
+            )
+        fi
+
+        # TODO: jq install is just to be backward compatible with older containers. Should eventually remove.
+        cat <<EOF >$SNAPSHOT_DIR/continue.sh
+#!/bin/bash
+SCRIPT_DIR=\$( cd -- "\$( dirname -- "\${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+cd \$SCRIPT_DIR
+
+HF_HOME=$HF_HOME \\
+HF_DATASETS_CACHE=$HF_DATASETS_CACHE \\
+COMMAND="apt install -y jq && uv run $rel_script ${RELEASE_ARGS[@]}" \\
+CONTAINER=$CONTAINER \\
+MOUNTS="$SNAPSHOT_DIR:$SNAPSHOT_DIR${MOUNTS}" \\
+sbatch \\
+    --nodes=$NUM_NODES \\
+    --account=$ACCOUNT \\
+    --job-name=$ACCOUNT:$JOB_NAME \\
+    --partition=$PARTITION \\
+    --time=0:${NUM_MINUTES}:0 \\
+    --gres=gpu:8 \\
+    --output=slurm-${NOW}-%j-${JOB_NAME}-${i}.${NUM_RUNS}.out \\
+    ray.sub
+EOF
+        if [[ "${DRYRUN}" -eq 2 ]]; then
+            echo "[DRY_RUN]: Skipping submission of $SCRIPT. Find the snapshot at $SNAPSHOT_DIR and manually launch with 'bash continue.sh'"
+        else
+            bash $SNAPSHOT_DIR/continue.sh
+        fi
+    done
+done
+echo "[INFO]: Total GPU hours: $total_gpu_hours"
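+
+# Worked example (illustrative numbers): a recipe with NUM_RUNS=5, NUM_NODES=4,
+# and NUM_MINUTES=240 is accounted above as 5 * 4 * 8 * 240 / 60 = 640 GPU hours.
+# A dry run over a set of recipes prints one such line per recipe plus the final
+# total, without creating snapshots or submitting anything:
+#
+#   HF_HOME=... HF_DATASETS_CACHE=... DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= \
+#       ./tools/launch recipes/llm/*.sh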
diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
new file mode 100755
index 0000000000..bf8aa1befc
--- /dev/null
+++ b/tools/package_release_runs.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# This script packages all release runs into a tarball with a git SHA so that we can upload to our
+# release page. The SHA is to avoid conflicts with previous runs, but when we upload we should
+# remove it so that users can expect the name to be release_runs.tar.gz (this renaming can be
+# done in the GitHub Release UI).
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
+cd $PROJECT_ROOT
+
+set -eou pipefail
+
+# Create a temporary directory
+TMP_DIR=$(mktemp -d)
+echo "Created temporary directory: $TMP_DIR"
+
+# Loop over all the recipe runs and package them into a tarball
+for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
+    exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
+    # Obfuscate the hostname, e.g.
+    #   events.out.tfevents.1744822578.<hostname>.780899.0
+    obfuscated_event_path=$(basename $tbevent | awk -F. '{print $1"."$2"."$3"."$4".HOSTNAME."$(NF-1)"."$NF}')
+
+    # Create subdirectory for experiment if it doesn't exist
+    mkdir -p "$TMP_DIR/$exp_name"
+
+    # Copy the event file with obfuscated name to the experiment subdirectory
+    cp "$tbevent" "$TMP_DIR/$exp_name/$obfuscated_event_path"
+
+    echo "[$exp_name] Copied $tbevent to $TMP_DIR/$exp_name/$obfuscated_event_path"
+done
+
+# Create a tarball of all the processed event files
+OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
+tar -czf "$OUTPUT_TAR" -C "$TMP_DIR" .
+echo "Created tarball: $OUTPUT_TAR"
+
+# Clean up the temporary directory
+rm -rf "$TMP_DIR"
+echo "Cleaned up temporary directory $TMP_DIR"

From f3aea5b982174d68446b462e04a7152ed455b53e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Thu, 24 Apr 2025 12:21:46 -0700
Subject: [PATCH 02/19] fix unit tests

Signed-off-by: Terry Kong

---
 tests/unit/test_recipes_and_test_suites.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 100486ddf4..244dde7b4e 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -134,7 +134,7 @@ def test_all_recipes_accounted_for_in_test_suites(all_test_suites):
 
 
 def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):
-    command = f"DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
+    command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
 
     print(f"Running command: {command}")
 
@@ -170,7 +170,7 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):
 
 
 def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
-    command = "DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh"
+    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=...
CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh" # Run the command from the project root directory result = subprocess.run( From da05a056056996926413625e79eade66d15e6eb7 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Apr 2025 12:22:45 -0700 Subject: [PATCH 03/19] helpful msg Signed-off-by: Terry Kong --- tests/unit/test_recipes_and_test_suites.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 244dde7b4e..0214d1ca5e 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -165,7 +165,9 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker): f"Last line of output was not as expected: '{last_line}'" ) total_gpu_hours = float(last_line.split(":")[-1].strip()) - assert total_gpu_hours <= 1024, f"Total GPU hours exceeded 1024: {last_line}" + assert total_gpu_hours <= 1024, ( + f"Total GPU hours exceeded 1024: {last_line}. We should revisit the test suites to reduce the total GPU hours." + ) tracker.track("total_nightly_gpu_hours", total_gpu_hours) From 42aaaf25de1b9ee3761c74a3aae3575e289eccf4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 24 Apr 2025 22:01:02 -0700 Subject: [PATCH 04/19] fix settings Signed-off-by: Terry Kong --- .../grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 2 +- .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 2 +- .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 2 +- .../grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 2 +- .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 4 ++-- ...sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 15 ++++++++------- .../sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 2 +- recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 8 ++++---- .../sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 10 ++++++---- 9 files changed, 25 insertions(+), 22 deletions(-) diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh index 14df2cfe5f..3989e10c51 100755 --- a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +++ b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -60,6 +60,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["500"] < 1.1' fi diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh index 466b1a41ec..0f7cf2ef76 100755 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -64,6 +64,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["2"] < 1.1' fi diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index cb8f5c9bca..6509829b0f 100755 --- 
a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -6,7 +6,7 @@ NUM_NODES=4 STEPS_PER_RUN=30 MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=60 +NUM_MINUTES=90 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh index a4037b01d7..6686df40a5 100755 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -62,6 +62,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["30"] < 1.1' fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index 70f834e0a8..f37dd89a1b 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -6,12 +6,12 @@ NUM_NODES=1 STEPS_PER_RUN=500 MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 +NUM_MINUTES=30 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) -# Mark the current repo as safe, since wandb fetchs metadata about the repo +# Mark the current repo as safe, since wandb fetchs metadata about the repo/ git config --global --add safe.directory $PROJECT_ROOT EXP_NAME=$(basename $0 .sh) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh index 1402a094dc..f2e2e9f2e0 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -1,13 +1,13 @@ #!/bin/bash set -eou pipefail -# TODO: @ashors real convergence run +# TODO: @ashors real convergence run (dataset only has 2737) # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=12000 -MAX_STEPS=12000 +STEPS_PER_RUN=2730 +MAX_STEPS=2730 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 +NUM_MINUTES=120 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) @@ -58,11 +58,12 @@ python -u examples/run_sft.py \ # Convert tensorboard logs to json python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +# TODO: the memory check is known to OOM. 
see https://github.com/NVIDIA/reinforcer/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["2730"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 3f5ce413eb..6421fed43f 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -6,7 +6,7 @@ NUM_NODES=1 STEPS_PER_RUN=350 MAX_STEPS=350 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 +NUM_MINUTES=45 # ===== END CONFIG ===== SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh index 2c6f930399..28028c1cdd 100755 --- a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -3,8 +3,8 @@ set -eou pipefail # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=1000 -MAX_STEPS=1000 +STEPS_PER_RUN=500 +MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -60,7 +60,7 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["500"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 25000' fi diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh index ad23383dc9..9a2ca8f19f 100755 --- a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -1,6 +1,9 @@ #!/bin/bash set -eou pipefail +# TODO: this config can crash on OOM +# https://github.com/NVIDIA/reinforcer/issues/263 + # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=20 # step_time ~ 29sec @@ -62,9 +65,8 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 1.5' \ + 'data["train/loss"]["20"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 35000' fi From b3a047195c2d979b1b163b05559326f275fd691c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:26:54 -0700 Subject: [PATCH 05/19] fix step grpo-qwen2.5-7b-instruct-4n8g-fsdp1 Signed-off-by: Terry Kong --- recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index 6509829b0f..22e69c307e 100755 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -59,6 +59,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["160"] < 1.1' + 'data["train/token_mult_prob_error"]["30"] < 1.1' fi From f620160787b2a07dcc8bc01b548a6a677225f943 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:32:59 -0700 Subject: [PATCH 06/19] finalized sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp Signed-off-by: Terry Kong --- recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 6421fed43f..bcff7b5a38 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -58,11 +58,13 @@ python -u examples/run_sft.py \ # Convert tensorboard logs to json python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 + # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["60"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi From d5962e6770425b6c110f83fbe8a2b7b678ef0aa4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:35:30 -0700 Subject: [PATCH 07/19] finalized grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long Signed-off-by: Terry Kong --- recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh index 04a380c746..0494e8a6d9 100755 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -64,6 +64,6 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then python -u tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["20"] < 1.1' fi From b8aa7f0df712c31e03008e098b2fcd76da951a9e Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 09:02:03 -0700 Subject: [PATCH 08/19] fix sft-llama3.1-8b-instruct-1n8g-fsdp1 Signed-off-by: Terry Kong --- recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index f37dd89a1b..a1c68aa28a 100755 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -3,8 +3,8 @@ set -eou pipefail # ===== BEGIN CONFIG ===== NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 +STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -60,7 +60,7 @@ python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["60"] < 0.45' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 30000' + 'data["train/loss"]["1"] < 4' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 60000' fi From f656e495e2b46d0d11b86b975776ae1727396d5c Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 15:58:31 -0700 Subject: [PATCH 09/19] uber refactor Signed-off-by: Terry Kong --- ...ama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml | 109 ++++++++++++++++++ ...po-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml | 109 ++++++++++++++++++ ...2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml | 109 ++++++++++++++++++ ...-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml | 109 ++++++++++++++++++ .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml | 109 ++++++++++++++++++ ...o-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml | 109 ++++++++++++++++++ ...n2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml | 109 ++++++++++++++++++ .../sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml | 67 +++++++++++ ...ama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml | 67 +++++++++++ ...-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml | 67 +++++++++++ .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml | 67 +++++++++++ ...t-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml | 67 +++++++++++ ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh | 65 ----------- ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 65 ----------- ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 69 ----------- ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 69 ----------- .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 64 ---------- ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 67 ----------- ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh | 65 ----------- .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 66 ----------- ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 69 ----------- ...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 70 ----------- recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 66 ----------- ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 72 ------------ {recipes => tests/test_suites}/README.md | 0 tests/test_suites/llm/common.env | 51 ++++++++ ...llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh | 38 ++++++ ...grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh | 38 ++++++ ...en2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh | 38 ++++++ ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh | 38 ++++++ .../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh | 38 ++++++ ...rpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh | 38 ++++++ ...wen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh | 38 ++++++ .../test_suites}/llm/performance/.gitkeep | 0 .../sft-llama3.1-8b-instruct-1n8g-fsdp1.sh | 39 +++++++ ...llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh | 41 +++++++ 
...ft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 41 +++++++ .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh | 39 +++++++ ...sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh | 41 +++++++ 39 files changed, 1616 insertions(+), 807 deletions(-) create mode 100644 examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml create mode 100644 examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml create mode 100644 examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml create mode 100644 examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml create mode 100644 examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml delete mode 100755 recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh delete mode 100755 recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh delete mode 100755 recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh delete mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh delete mode 100755 recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh delete mode 100755 recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh delete mode 100755 recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh delete mode 100755 recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh delete mode 100755 recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh rename {recipes => tests/test_suites}/README.md (100%) create mode 100644 tests/test_suites/llm/common.env create mode 100755 tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh create mode 100755 tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh create mode 100755 tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh rename {recipes => tests/test_suites}/llm/performance/.gitkeep (100%) create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh create mode 100755 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh create mode 100755 tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh create mode 100755 tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml 
b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..ba6ba255f3 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.1-8B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..96e8e023cb --- /dev/null +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 500 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: 
results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B-Instruct + tokenizer: + name: meta-llama/Llama-3.2-1B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 128009 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 128009 + model_name: meta-llama/Llama-3.2-1B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml new file mode 100644 index 0000000000..3693ac4677 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 20 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + 
weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..aed12183a8 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 2 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 16384 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 16384 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151643 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 16384 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-32B +data: + max_input_seq_length: 16384 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + 
dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 16 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml new file mode 100644 index 0000000000..27211ddc7e --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml new file mode 100644 index 0000000000..87e2c592c0 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 64 + 
num_generations_per_prompt: 32 + max_rollout_turns: 1 + max_num_steps: 30 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-7B-Instruct + tokenizer: + name: Qwen/Qwen2.5-7B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 1 + generation_batch_size: 32 + logprob_batch_size: 2 + max_total_sequence_length: 4096 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 4 + make_sequence_length_divisible_by: 4 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 3e-07 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 4096 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.6 + max_model_len: 4096 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-7B-Instruct +data: + max_input_seq_length: 4096 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..9f5762f173 --- /dev/null +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml @@ -0,0 +1,109 @@ +grpo: + num_prompts_per_step: 32 + num_generations_per_prompt: 16 + max_rollout_turns: 1 + max_num_steps: 450 + normalize_rewards: true + use_leave_one_out_baseline: true + val_period: 10 + val_at_start: false + max_val_samples: 256 + val_batch_size: 256 +loss_fn: + reference_policy_kl_penalty: 0.01 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 + use_on_policy_kl_approximation: false + use_importance_sampling_correction: false +checkpointing: + enabled: true + checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + metric_name: val_reward + higher_is_better: true + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct + tokenizer: + name: Qwen/Qwen2.5-Math-1.5B-Instruct + train_global_batch_size: 512 + train_micro_batch_size: 4 + 
generation_batch_size: 32 + logprob_batch_size: 4 + max_total_sequence_length: 512 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + refit_buffer_size_gb: 4 + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.01 + betas: + - 0.9 + - 0.999 + eps: 1e-08 + foreach: false + fused: false + scheduler: + - name: torch.optim.lr_scheduler.LinearLR + kwargs: + start_factor: 0.1 + end_factor: 1 + total_iters: 50 + - name: torch.optim.lr_scheduler.ConstantLR + kwargs: + factor: 1 + total_iters: 10000000000 + - milestones: + - 50 + generation: + backend: vllm + max_new_tokens: 512 + temperature: 1 + top_p: 1 + top_k: null + stop_token_ids: + - 151645 + stop_strings: null + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.6 + max_model_len: 512 + load_format: dummy + skip_tokenizer_init: true + pad_token_id: 151643 + model_name: Qwen/Qwen2.5-Math-1.5B-Instruct +data: + max_input_seq_length: 512 + prompt_file: examples/prompts/cot.txt + system_prompt_file: null + dataset_name: OpenMathInstruct-2 +env: + math: + num_workers: 8 +logger: + log_dir: logs/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + num_val_samples_to_print: 0 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1 + tensorboard: {} + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml new file mode 100644 index 0000000000..da0140a73e --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 250 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: false + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp1 + wandb_enabled: true + tensorboard_enabled: true + 
monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml new file mode 100644 index 0000000000..288f365c1a --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 2730 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml new file mode 100644 index 0000000000..f065b5cd34 --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 350 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.1-8B-Instruct + tokenizer: + name: meta-llama/Llama-3.1-8B-Instruct + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + 
train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: false + tensor_parallel_size: 2 + make_sequence_length_divisible_by: 2 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml new file mode 100644 index 0000000000..7c4bd357ed --- /dev/null +++ b/examples/configs/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 500 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + enabled: true + checkpoint_dir: results/sft-llama3.2-1b-1n8g-fsdp2tp1 + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: meta-llama/Llama-3.2-1B + tokenizer: + name: meta-llama/Llama-3.2-1B + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 1024 + precision: float32 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: false + activation_checkpointing: false + tensor_parallel_size: 1 + make_sequence_length_divisible_by: 1 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 1024 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-llama3.2-1b-1n8g-fsdp2tp1 + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-llama3.2-1b-1n8g-fsdp2tp1 + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 1 diff --git a/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml new file mode 100644 index 0000000000..4cd1a5387c --- /dev/null +++ b/examples/configs/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.yaml @@ -0,0 +1,67 @@ +sft: + max_num_epochs: 1 + max_num_steps: 20 + val_period: 10 + val_batches: 8 + val_global_batch_size: 32 + val_micro_batch_size: 1 + val_at_start: true + seed: 42 +checkpointing: + 
enabled: true + checkpoint_dir: results/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + metric_name: val_loss + higher_is_better: false + keep_top_k: 3 + save_period: 10 +policy: + model_name: Qwen/Qwen2.5-32B + tokenizer: + name: Qwen/Qwen2.5-32B + chat_template: '{% for message in messages %}{%- if message[''role''] == ''system'' %}{{''Context: '' + message[''content''].strip()}}{%- elif message[''role''] == ''user'' %}{{'' Question: '' + message[''content''].strip() + '' Answer:''}}{%- elif message[''role''] == ''assistant'' %}{{'' '' + message[''content''].strip()}}{%- endif %}{% endfor %}' + train_global_batch_size: 32 + train_micro_batch_size: 1 + max_total_sequence_length: 16000 + precision: bfloat16 + fsdp_offload_enabled: false + activation_checkpointing_enabled: false + dtensor_cfg: + enabled: true + cpu_offload: false + sequence_parallel: true + activation_checkpointing: true + tensor_parallel_size: 8 + make_sequence_length_divisible_by: 8 + max_grad_norm: 1 + optimizer: + name: torch.optim.AdamW + kwargs: + lr: 5e-06 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + eps: 1e-05 + foreach: false + fused: false +data: + max_input_seq_length: 16000 + dataset_name: squad + add_bos: true + add_eos: true +logger: + log_dir: logs/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + wandb_enabled: true + tensorboard_enabled: true + monitor_gpus: true + wandb: + project: nemo-rl + name: sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt + tensorboard: + log_dir: tb_logs-sft-dev-squad + gpu_monitoring: + collection_interval: 10 + flush_interval: 10 +cluster: + gpus_per_node: 8 + num_nodes: 4 diff --git a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh deleted file mode 100755 index 3feb431a2f..0000000000 --- a/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=100 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' -fi - diff --git a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh deleted file mode 100755 index 3989e10c51..0000000000 --- a/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_1B.yaml \ - policy.model_name=meta-llama/Llama-3.2-1B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["500"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh deleted file mode 100755 index 0494e8a6d9..0000000000 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=16 -STEPS_PER_RUN=10 -MAX_STEPS=20 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=240 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-32B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=8 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.dtensor_cfg.activation_checkpointing=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - policy.max_total_sequence_length=16384 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["20"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh deleted file mode 100755 index 0f7cf2ef76..0000000000 --- a/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=16 -STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] -MAX_STEPS=2 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=60 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-32B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=8 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.dtensor_cfg.activation_checkpointing=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - policy.max_total_sequence_length=16384 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["2"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh deleted file mode 100755 index 22e69c307e..0000000000 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=30 -MAX_STEPS=30 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=90 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-7B-Instruct \ - policy.dtensor_cfg.enabled=false \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["30"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh deleted file mode 100755 index 6686df40a5..0000000000 --- a/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=30 -MAX_STEPS=30 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=180 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_8B.yaml \ - policy.model_name=Qwen/Qwen2.5-7B-Instruct \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=4 \ - policy.dtensor_cfg.sequence_parallel=True \ - policy.generation.vllm_cfg.tensor_parallel_size=4 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["30"] < 1.1' -fi - diff --git a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh deleted file mode 100755 index 89bf673983..0000000000 --- a/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=450 -MAX_STEPS=450 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_grpo_math.py \ - --config examples/configs/grpo_math_1B.yaml \ - policy.model_name=Qwen/Qwen2.5-Math-1.5B-Instruct \ - policy.dtensor_cfg.enabled=true \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - grpo.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' -fi - diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh deleted file mode 100755 index a1c68aa28a..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=250 -MAX_STEPS=250 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo/ -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=False \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 4' \ - 'data["train/loss"]["250"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 60000' -fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh deleted file mode 100755 index f2e2e9f2e0..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# TODO: @ashors real convergence run (dataset only has 2737) -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=2730 -MAX_STEPS=2730 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=120 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263 -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["2730"] < 0.3' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 45000' -fi diff --git a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh deleted file mode 100755 index bcff7b5a38..0000000000 --- a/recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=350 -MAX_STEPS=350 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=45 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.1-8B-Instruct \ - policy.precision=bfloat16 \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=2 \ - policy.dtensor_cfg.sequence_parallel=True \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - # TODO: FIGURE OUT CORRECT METRICS - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["60"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 45000' -fi diff --git a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh deleted file mode 100755 index 28028c1cdd..0000000000 --- a/recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# ===== BEGIN CONFIG ===== -NUM_NODES=1 -STEPS_PER_RUN=500 -MAX_STEPS=500 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=15 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT - -EXP_NAME=$(basename $0 .sh) -EXP_DIR=$SCRIPT_DIR/$EXP_NAME -LOG_DIR=$EXP_DIR/logs -CKPT_DIR=$EXP_DIR/ckpts -JSON_METRICS=$EXP_DIR/metrics.json -RUN_LOG=$EXP_DIR/run.log -export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} - -mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR - -# Early stopping to save compute if max step has been reached -STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0) -if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then - echo "[INFO] Target step $MAX_STEPS reached, skipping run" - exit 0 -fi -echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps" - -# Run the experiment -cd $PROJECT_ROOT -python -u examples/run_sft.py \ - --config examples/configs/sft.yaml \ - policy.model_name=meta-llama/Llama-3.2-1B \ - policy.dtensor_cfg.enabled=True \ - policy.dtensor_cfg.tensor_parallel_size=1 \ - cluster.num_nodes=$NUM_NODES \ - cluster.gpus_per_node=8 \ - sft.max_num_steps=$MAX_STEPS \ - logger.log_dir=$LOG_DIR \ - logger.wandb_enabled=True \ - logger.wandb.project=nemo-rl \ - logger.wandb.name=$EXP_NAME \ - logger.monitor_gpus=True \ - logger.tensorboard_enabled=True \ - checkpointing.enabled=True \ - checkpointing.checkpoint_dir=$CKPT_DIR \ - $@ \ - 2>&1 | tee $RUN_LOG - -# Convert tensorboard logs to json -python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS - -# Only run metrics if the target step is reached -if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then - python -u tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["1"] < 2.4' \ - 'data["train/loss"]["500"] < 0.5' \ - 'max(data["ray/node.0.gpu.0.memory"]) < 25000' -fi - diff --git a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh deleted file mode 100755 index 9a2ca8f19f..0000000000 --- a/recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -set -eou pipefail - -# TODO: this config can crash on OOM -# https://github.com/NVIDIA/reinforcer/issues/263 - -# ===== BEGIN CONFIG ===== -NUM_NODES=4 -STEPS_PER_RUN=20 # step_time ~ 29sec -MAX_STEPS=20 -NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up -NUM_MINUTES=30 -# ===== END CONFIG ===== - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -PROJECT_ROOT=$(realpath $SCRIPT_DIR/../..) 
-# Mark the current repo as safe, since wandb fetchs metadata about the repo
-git config --global --add safe.directory $PROJECT_ROOT
-
-EXP_NAME=$(basename $0 .sh)
-EXP_DIR=$SCRIPT_DIR/$EXP_NAME
-LOG_DIR=$EXP_DIR/logs
-CKPT_DIR=$EXP_DIR/ckpts
-JSON_METRICS=$EXP_DIR/metrics.json
-RUN_LOG=$EXP_DIR/run.log
-export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
-
-mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR
-
-# Early stopping to save compute if max step has been reached
-STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
-if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then
-    echo "[INFO] Target step $MAX_STEPS reached, skipping run"
-    exit 0
-fi
-echo "[INFO] Steps so far: $STEPS_SO_FAR, running till $MAX_STEPS steps"
-
-# Run the experiment
-cd $PROJECT_ROOT
-python -u examples/run_sft.py \
-    --config examples/configs/sft.yaml \
-    policy.model_name=Qwen/Qwen2.5-32B \
-    policy.precision=bfloat16 \
-    policy.dtensor_cfg.enabled=True \
-    policy.dtensor_cfg.tensor_parallel_size=8 \
-    policy.dtensor_cfg.sequence_parallel=True \
-    policy.dtensor_cfg.activation_checkpointing=True \
-    policy.max_total_sequence_length=16000 \
-    cluster.num_nodes=$NUM_NODES \
-    cluster.gpus_per_node=8 \
-    sft.max_num_steps=$MAX_STEPS \
-    logger.log_dir=$LOG_DIR \
-    logger.wandb_enabled=True \
-    logger.wandb.project=nemo-rl \
-    logger.wandb.name=$EXP_NAME \
-    logger.monitor_gpus=True \
-    logger.tensorboard_enabled=True \
-    checkpointing.enabled=True \
-    checkpointing.checkpoint_dir=$CKPT_DIR \
-    $@ \
-    2>&1 | tee $RUN_LOG
-
-# Convert tensorboard logs to json
-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
-
-# Only run metrics if the target step is reached
-if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
-    python -u tests/check_metrics.py $JSON_METRICS \
-        'data["train/loss"]["1"] < 1.5' \
-        'data["train/loss"]["20"] < 0.3' \
-        'max(data["ray/node.0.gpu.0.memory"]) < 35000'
-fi
diff --git a/recipes/README.md b/tests/test_suites/README.md
similarity index 100%
rename from recipes/README.md
rename to tests/test_suites/README.md
diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
new file mode 100644
index 0000000000..2fc1bb27be
--- /dev/null
+++ b/tests/test_suites/llm/common.env
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Source this file before running a test to set up the environment:
+#
+# source ./common.env
+set -eou pipefail
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+PROJECT_ROOT=$(git rev-parse --show-toplevel)
+# Mark the current repo as safe, since wandb fetches metadata about the repo
+git config --global --add safe.directory $PROJECT_ROOT
+
+get_max_steps_from_yaml() {
+    local top_key="$1"
+    local yaml_path="$2"
+    # Use python -c to import yaml, load the file, and print the specific value
+    # Access the dictionary using the provided top_key
+    python -c "import yaml; f=open('$yaml_path', 'r'); data=yaml.safe_load(f); print(data['$top_key']['max_num_steps']); f.close()"
+}
+
+exit_if_max_steps_reached() {
+    # Early stopping to save compute if max step has been reached
+    STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
+    if [[ $STEPS_SO_FAR -ge $MAX_STEPS ]]; then
+        echo "[INFO] Target step $MAX_STEPS reached, skipping run"
+        exit 0
+    fi
+    echo "[INFO] Steps so far: $STEPS_SO_FAR, running until step $MAX_STEPS"
+}
+
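+# Illustrative (hypothetical) metrics.json shape consumed by the jq query above:
+#   {"train/loss": {"1": 2.31, "2": 2.05, "500": 0.42}, "train/token_mult_prob_error": {...}}
+# `keys | map(tonumber) | max` picks out the last logged step, and the
+# `|| echo 0` fallback covers the first run, before metrics.json exists.
+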
+EXP_NAME=$(basename $0 .sh) +ALGO=$(cut -d'-' -f1 <<< $EXP_NAME) +EXP_DIR=$SCRIPT_DIR/$EXP_NAME +LOG_DIR=$EXP_DIR/logs +CKPT_DIR=$EXP_DIR/ckpts +JSON_METRICS=$EXP_DIR/metrics.json +RUN_LOG=$EXP_DIR/run.log + +# Test script has path: tests/test_suites/llm/${EXP_NAME}.sh +# where config has path: examples/configs/recipes/llm/${EXP_NAME}.yaml +# We will assume/check the path matches this pattern +CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites/llm#examples/configs/recipes/llm#') +if [[ ! -f $CONFIG_PATH ]]; then + echo "[ERROR] Config file $CONFIG_PATH not found" + exit 1 +fi + +export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} + +mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR + +MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH) \ No newline at end of file diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..b0f81e9886 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=100 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..3cf1a34eb5 --- /dev/null +++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | 
keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["500"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh new file mode 100755 index 0000000000..59b1d4f7d3 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=10 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=240 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["20"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh new file mode 100755 index 0000000000..bf1dba7e85 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=16 +STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=60 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["2"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh new file mode 100755 index 0000000000..7e9ef5b050 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -0,0 +1,38 @@ 
+#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=90 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh new file mode 100755 index 0000000000..ba3fe1dd52 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=4 +STEPS_PER_RUN=30 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=180 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["30"] < 1.1' +fi + diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..6f78bce178 --- /dev/null +++ b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -0,0 +1,38 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=450 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_grpo_math.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + 
logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'mean(data["train/token_mult_prob_error"]) < 1.1' \ + 'data["train/token_mult_prob_error"]["100"] < 1.1' +fi + diff --git a/recipes/llm/performance/.gitkeep b/tests/test_suites/llm/performance/.gitkeep similarity index 100% rename from recipes/llm/performance/.gitkeep rename to tests/test_suites/llm/performance/.gitkeep diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh new file mode 100755 index 0000000000..048de3418c --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=250 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=30 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 4' \ + 'data["train/loss"]["250"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 60000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh new file mode 100755 index 0000000000..f963a6c55d --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# TODO: @ashors real convergence run (dataset only has 2737) +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=2730 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=120 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path 
$JSON_METRICS + +# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263 +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["2730"] < 0.3' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh new file mode 100755 index 0000000000..2bec6f9c8e --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -0,0 +1,41 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=350 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=45 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263 + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + # TODO: FIGURE OUT CORRECT METRICS + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 5' \ + 'data["train/loss"]["60"] < 0.5' \ + 'max(data["ray/node.0.gpu.0.memory"]) < 45000' +fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh new file mode 100755 index 0000000000..2373889ecc --- /dev/null +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -0,0 +1,39 @@ +#!/bin/bash +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) +source $SCRIPT_DIR/common.env + +# ===== BEGIN CONFIG ===== +NUM_NODES=1 +STEPS_PER_RUN=500 +NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up +NUM_MINUTES=15 +# ===== END CONFIG ===== + +exit_if_max_steps_reached + +# Run the experiment +cd $PROJECT_ROOT +uv run examples/run_sft.py \ + --config $CONFIG_PATH \ + logger.log_dir=$LOG_DIR \ + logger.wandb_enabled=True \ + logger.wandb.project=nemo-rl \ + logger.wandb.name=$EXP_NAME \ + logger.monitor_gpus=True \ + logger.tensorboard_enabled=True \ + checkpointing.enabled=True \ + checkpointing.checkpoint_dir=$CKPT_DIR \ + $@ \ + 2>&1 | tee $RUN_LOG + +# Convert tensorboard logs to json +uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS + +# Only run metrics if the target step is reached +if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then + uv run tests/check_metrics.py $JSON_METRICS \ + 'data["train/loss"]["1"] < 2.4' \ + 'data["train/loss"]["500"] < 
0.5' \
+        'max(data["ray/node.0.gpu.0.memory"]) < 25000'
+fi
+
diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
new file mode 100755
index 0000000000..f7a9050c9c
--- /dev/null
+++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# TODO: this config can crash on OOM
+# https://github.com/NVIDIA/reinforcer/issues/263
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=4
+STEPS_PER_RUN=20 # step_time ~ 29sec
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
+NUM_MINUTES=30
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_sft.py \
+    --config $CONFIG_PATH \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'data["train/loss"]["1"] < 1.5' \
+        'data["train/loss"]["20"] < 0.3' \
+        'max(data["ray/node.0.gpu.0.memory"]) < 35000'
+fi

From 1db320b9457d7e9bff0508e450b33509190bcc0d Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 16:02:44 -0700
Subject: [PATCH 10/19] add dryrun backdoor for tests

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 2fc1bb27be..af00aae410 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -46,6 +46,11 @@ fi
 
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR
+MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
+
+if [[ -n "${DRYRUN:-}" ]]; then
+    echo "[INFO] DRYRUN mode: exiting before launch (used for testing)"
+    exit
+fi
 
-MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
\ No newline at end of file
+mkdir -p $EXP_DIR $LOG_DIR $CKPT_DIR

From 990ca45ab1b653a2a0fa36f00db7719d0616245b Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 16:29:21 -0700
Subject: [PATCH 11/19] revert maxsteps

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env                    | 13 +------------
 .../grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh |  2 ++
 .../llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh  |  2 ++
 ...rpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh |  2 ++
 .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh    |  2 ++
 .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh      |  2 ++
 .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh |  2 ++
 ...grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh |  4 +++-
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh      |  2 ++
 .../sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh  |  2 ++
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh |  4 +++-
 .../llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh            |  2 ++
 .../llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh  |  2 ++
 13 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index af00aae410..870b5af6bc 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -9,14 +9,6 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 # Mark the current repo as safe, since wandb fetches metadata about the repo
 git config --global --add safe.directory $PROJECT_ROOT
 
-get_max_steps_from_yaml() {
-    local top_key="$1"
-    local yaml_path="$2"
-    # Use python -c to import yaml, load the file, and print the specific value
-    # Access the dictionary using the provided top_key
-    python -c "import yaml; f=open('$yaml_path', 'r'); data=yaml.safe_load(f); print(data['$top_key']['max_num_steps']); f.close()"
-}
-
 exit_if_max_steps_reached() {
     # Early stopping to save compute if max step has been reached
     STEPS_SO_FAR=$(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS || echo 0)
@@ -28,7 +20,6 @@ exit_if_max_steps_reached() {
 }
 
 EXP_NAME=$(basename $0 .sh)
-ALGO=$(cut -d'-' -f1 <<< $EXP_NAME)
 EXP_DIR=$SCRIPT_DIR/$EXP_NAME
 LOG_DIR=$EXP_DIR/logs
 CKPT_DIR=$EXP_DIR/ckpts
@@ -38,7 +29,7 @@ RUN_LOG=$EXP_DIR/run.log
 # Test script has path: tests/test_suites/llm/${EXP_NAME}.sh
 # where config has path: examples/configs/recipes/llm/${EXP_NAME}.yaml
 # We will assume/check the path matches this pattern
-CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites/llm#examples/configs/recipes/llm#')
+CONFIG_PATH=$(echo $SCRIPT_DIR/${EXP_NAME}.yaml | sed 's#tests/test_suites#examples/configs/recipes#')
 if [[ ! -f $CONFIG_PATH ]]; then
     echo "[ERROR] Config file $CONFIG_PATH not found"
     exit 1
@@ -46,8 +37,6 @@ fi
 
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
-MAX_STEPS=$(get_max_steps_from_yaml $ALGO $CONFIG_PATH)
-
 if [[ -n "${DRYRUN:-}" ]]; then
     echo "[INFO] DRYRUN mode: exiting before launch (used for testing)"
     exit
diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
index b0f81e9886..6e64876058 100755
--- a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
+++ b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
@@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env
 # ===== BEGIN CONFIG =====
 NUM_NODES=4
 STEPS_PER_RUN=100
+MAX_STEPS=500
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
 NUM_MINUTES=240
 # ===== END CONFIG =====
@@ -15,6 +16,7 @@ exit_if_max_steps_reached
 cd $PROJECT_ROOT
 uv run examples/run_grpo_math.py \
     --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
     logger.wandb.project=nemo-rl \
diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
index 3cf1a34eb5..45cfad6e83 100755
--- a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
+++ b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
@@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env
 # ===== BEGIN CONFIG =====
 NUM_NODES=1
 STEPS_PER_RUN=500
+MAX_STEPS=500
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
 NUM_MINUTES=120
 # ===== END CONFIG =====
@@ -15,6 +16,7 @@ exit_if_max_steps_reached
 cd $PROJECT_ROOT
 uv run examples/run_grpo_math.py \
     --config $CONFIG_PATH \
+    grpo.max_num_steps=$MAX_STEPS \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=True \
     logger.wandb.project=nemo-rl \
diff --git
a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh index 59b1d4f7d3..69c9899ccd 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=16 STEPS_PER_RUN=10 +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=240 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh index bf1dba7e85..ccdef1b2bd 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=16 STEPS_PER_RUN=2 # 40min: step_time: [1341, 801] +MAX_STEPS=2 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=60 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh index 7e9ef5b050..49c96a6f58 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=30 +MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=90 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh index ba3fe1dd52..b3071fb58e 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +++ b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=30 +MAX_STEPS=30 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=180 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh index 6f78bce178..98df00c25c 100755 --- a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +++ 
b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=450 +MAX_STEPS=450 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_grpo_math.py \ --config $CONFIG_PATH \ + grpo.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ @@ -33,6 +35,6 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then uv run tests/check_metrics.py $JSON_METRICS \ 'mean(data["train/token_mult_prob_error"]) < 1.1' \ - 'data["train/token_mult_prob_error"]["100"] < 1.1' + 'data["train/token_mult_prob_error"]["450"] < 1.1' fi diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh index 048de3418c..1e51c2a78f 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=250 +MAX_STEPS=250 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh index f963a6c55d..1f937018a3 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh @@ -6,6 +6,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=2730 +MAX_STEPS=2730 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=120 # ===== END CONFIG ===== @@ -16,6 +17,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh index 2bec6f9c8e..2379681138 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=350 +MAX_STEPS=350 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=45 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ @@ -36,6 +38,6 @@ if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | ma # TODO: FIGURE OUT CORRECT METRICS uv run tests/check_metrics.py $JSON_METRICS \ 
'data["train/loss"]["1"] < 5' \ - 'data["train/loss"]["60"] < 0.5' \ + 'data["train/loss"]["350"] < 0.5' \ 'max(data["ray/node.0.gpu.0.memory"]) < 45000' fi diff --git a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh index 2373889ecc..24b966c2af 100755 --- a/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh +++ b/tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh @@ -5,6 +5,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=1 STEPS_PER_RUN=500 +MAX_STEPS=500 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=15 # ===== END CONFIG ===== @@ -15,6 +16,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh index f7a9050c9c..fd40a85764 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh @@ -8,6 +8,7 @@ source $SCRIPT_DIR/common.env # ===== BEGIN CONFIG ===== NUM_NODES=4 STEPS_PER_RUN=20 # step_time ~ 29sec +MAX_STEPS=20 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=30 # ===== END CONFIG ===== @@ -18,6 +19,7 @@ exit_if_max_steps_reached cd $PROJECT_ROOT uv run examples/run_sft.py \ --config $CONFIG_PATH \ + sft.max_num_steps=$MAX_STEPS \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=True \ logger.wandb.project=nemo-rl \ From 33dd63484e443ec3f9f8aedb82bf870eaf724d52 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 17:03:06 -0700 Subject: [PATCH 12/19] fix tests Signed-off-by: Terry Kong --- tests/test_suites/llm/common.env | 5 ++-- tests/test_suites/nightly.txt | 18 +++++++------- tests/test_suites/release.txt | 6 ++--- tests/unit/test_recipes_and_test_suites.py | 28 ++++++++++++++++------ tools/launch | 8 +++---- 5 files changed, 40 insertions(+), 25 deletions(-) diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env index 870b5af6bc..918b7bff1b 100644 --- a/tests/test_suites/llm/common.env +++ b/tests/test_suites/llm/common.env @@ -5,8 +5,9 @@ set -eou pipefail SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) -# Mark the current repo as safe, since wandb fetchs metadata about the repo -git config --global --add safe.directory $PROJECT_ROOT +# Mark all repos as safe in the test context, since wandb fetchs metadata about the repo and it's a +# catch-22 to get the project root and mark it safe if you don't know the project root +git config --global --add safe.directory "*" PROJECT_ROOT=$(git rev-parse --show-toplevel) exit_if_max_steps_reached() { diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 9b4eac9491..4c609d5bff 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,26 +3,26 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh -recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh # FSDP1 vs Dtensor 
From 33dd63484e443ec3f9f8aedb82bf870eaf724d52 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 17:03:06 -0700
Subject: [PATCH 12/19] fix tests

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env           |  5 ++--
 tests/test_suites/nightly.txt              | 18 +++++++-------
 tests/test_suites/release.txt              |  6 ++---
 tests/unit/test_recipes_and_test_suites.py | 28 ++++++++++++++++------
 tools/launch                               |  8 +++----
 5 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 870b5af6bc..918b7bff1b 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -5,8 +5,9 @@ set -eou pipefail

 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

-# Mark the current repo as safe, since wandb fetchs metadata about the repo
-git config --global --add safe.directory $PROJECT_ROOT
+# Mark all repos as safe in the test context, since wandb fetches metadata about the repo and it's a
+# catch-22 to get the project root and mark it safe if you don't know the project root
+git config --global --add safe.directory "*"
 PROJECT_ROOT=$(git rev-parse --show-toplevel)

 exit_if_max_steps_reached() {
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
index 9b4eac9491..4c609d5bff 100644
--- a/tests/test_suites/nightly.txt
+++ b/tests/test_suites/nightly.txt
@@ -3,26 +3,26 @@
 ########

 # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much
-recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
-recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh

 # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct)
-recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
-recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh

 # Functional 32b run
-recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh

 #######
 # SFT #
 #######

 # 1N 1B/8B runs
-recipes/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh

 # Dtensor vs fsdp1 (8B)
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp1.sh

 # Functional 32b test
-recipes/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
+tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt
index dfc997435b..69735cb0cb 100644
--- a/tests/test_suites/release.txt
+++ b/tests/test_suites/release.txt
@@ -3,14 +3,14 @@
 ########

 # Long 8b run
-recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
+tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh

 # Long 32b run
-recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh

 #######
 # SFT #
 #######

 # Long 8b convergence
-recipes/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
\ No newline at end of file
+tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
\ No newline at end of file
diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 0214d1ca5e..5081692ea1 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -18,8 +18,6 @@
 dir_path = os.path.dirname(os.path.abspath(__file__))
 project_root = os.path.abspath(os.path.join(dir_path, "..", ".."))

-recipes_dir = os.path.join(project_root, "recipes")
-
 test_suites_dir = os.path.join(project_root, "tests", "test_suites")

 nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt")
@@ -120,15 +118,15 @@ def test_no_overlap_across_test_suites(all_test_suites):

 def test_all_recipes_accounted_for_in_test_suites(all_test_suites):
     all_recipes_in_test_suites = set(all_test_suites)
-    all_recipes_in_recipes_dir = set()
+    all_tests_in_test_suites_dir = set()
     for recipe_path in glob.glob(
-        os.path.join(recipes_dir, "**", "*.sh"), recursive=True
+        os.path.join(test_suites_dir, "**", "*.sh"), recursive=True
     ):
         # Strip off the project root and leading slash
         recipe_name = recipe_path[len(project_root) + 1 :]
-        all_recipes_in_recipes_dir.add(recipe_name)
+        all_tests_in_test_suites_dir.add(recipe_name)

-    assert all_recipes_in_test_suites == all_recipes_in_recipes_dir, (
+    assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, (
         "All recipes are not accounted for in the test suites"
     )
@@ -172,7 +170,7 @@ def test_nightly_compute_stays_below_1024_hours(nightly_test_suite, tracker):

 def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
-    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./recipes/**/*.sh"
+    command = "DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./tests/test_suites/**/*.sh"

     # Run the command from the project root directory
     result = subprocess.run(
@@ -200,3 +198,19 @@ def test_dry_run_does_not_fail_and_prints_total_gpu_hours():
     assert last_line.startswith("[INFO]: Total GPU hours:"), (
         f"Last line of output was not as expected: '{last_line}'"
     )
+
+
+def test_all_tests_can_find_config_if_dryrun(all_test_suites):
+    for test_suite in all_test_suites:
+        command = f"DRYRUN=1 {test_suite}"
+        result = subprocess.run(
+            command,
+            shell=True,
+            cwd=project_root,
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert result.returncode == 0, (
+            f"Command failed with exit code {result.returncode}"
+        )
diff --git a/tools/launch b/tools/launch
index 1db03b3b03..4c76cee78d 100755
--- a/tools/launch
+++ b/tools/launch
@@ -40,11 +40,11 @@ extract_config() {
         echo "[ERROR]: Please add and update a section in the script with these variables:"
         echo
         echo "# ===== BEGIN CONFIG ====="
-        echo "NUM_NODES=1"  # How many nodes this job uses
-        echo "STEPS_PER_RUN=60"  # Approximately how many steps reached in one job
-        echo "MAX_STEPS=60"  # Max training steps
+        echo "NUM_NODES=1 # How many nodes this job uses"
+        echo "STEPS_PER_RUN=60 # Approximately how many steps reached in one job"
+        echo "MAX_STEPS=60 # Max training steps"
         echo 'NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up'
-        echo "NUM_MINUTES=240"  # How many minutes one job is (SLURM specific)
+        echo "NUM_MINUTES=240 # How many minutes one job is (SLURM specific)"
         echo "# ===== END CONFIG ====="
         return 1
     fi 1>&2

From 7d347f62e08beaaf96963cf38a551cd62da60e4e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sat, 26 Apr 2025 00:00:07 -0700
Subject: [PATCH 13/19] test dryrun to not conflict

Signed-off-by: Terry Kong
---
 tests/test_suites/llm/common.env           | 4 ++--
 tests/unit/test_recipes_and_test_suites.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_suites/llm/common.env b/tests/test_suites/llm/common.env
index 918b7bff1b..c2008292b9 100644
--- a/tests/test_suites/llm/common.env
+++ b/tests/test_suites/llm/common.env
@@ -38,8 +38,8 @@ fi

 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

-if [[ -n "$DRYRUN" ]]; then
-    echo "[INFO] DRYRUN mode: used for testing"
+if [[ -n "${TEST_DRYRUN:-}" ]]; then
+    echo "[INFO] TEST_DRYRUN mode: used for testing"
     exit
 fi

diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
index 5081692ea1..edceba3649 100644
--- a/tests/unit/test_recipes_and_test_suites.py
+++ b/tests/unit/test_recipes_and_test_suites.py
@@ -202,7 +202,7 @@ def test_dry_run_does_not_fail_and_prints_total_gpu_hours():

 def test_all_tests_can_find_config_if_dryrun(all_test_suites):
     for test_suite in all_test_suites:
-        command = f"DRYRUN=1 {test_suite}"
+        command = f"TEST_DRYRUN=1 {test_suite}"
         result = subprocess.run(
             command,
             shell=True,
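With PATCH 13 applied, the test-only early exit is spelled `TEST_DRYRUN` so it can no longer collide with the `DRYRUN` flag that `tools/launch` consumes. A usage sketch (the recipe name below is just one of the suite scripts above, not the only valid choice):

```sh
# Exits right after CONFIG_PATH resolution, before any directories are created
# or training is launched; a cheap way to verify every script finds its YAML.
TEST_DRYRUN=1 ./tests/test_suites/llm/sft-llama3.2-1b-1n8g-fsdp2tp1.sh

# The launcher-level dry run stays separate and prints the total GPU hours:
DRYRUN=1 CONTAINER= ACCOUNT= PARTITION= ./tools/launch ./tests/test_suites/**/*.sh
```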
From b67c558cb09a67d9153c30a95e5868b1294131dd Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 22:47:56 -0700
Subject: [PATCH 14/19] fix up packaging script to globstar

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index bf8aa1befc..bf8281d154 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -10,11 +10,26 @@ PROJECT_ROOT=$(realpath $SCRIPT_DIR/..)
 cd $PROJECT_ROOT

 set -eou pipefail

+# Enable recursive globbing
+shopt -s globstar
+
+OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
+
+# Check if the glob expanded to any files
+if [ -z "$(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events* 2>/dev/null || true)" ]; then
+    echo "Error: No tensorboard event files found matching the pattern."
+    exit 1
+elif [[ -f $OUTPUT_TAR ]]; then
+    echo "Error: $OUTPUT_TAR already exists. Clean it up before continuing."
+    exit 1
+fi

-# Create a temporary directory
 TMP_DIR=$(mktemp -d)
 echo "Created temporary directory: $TMP_DIR"

+# Set up trap to clean up temporary directory on exit
+trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT
+
 # Loop over all the recipe runs and package them into a tarball
 for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
     exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
@@ -32,10 +47,5 @@ for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
 done

 # Create a tarball of all the processed event files
-OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"
 tar -czf "$OUTPUT_TAR" -C "$TMP_DIR" .
 echo "Created tarball: $OUTPUT_TAR"
-
-# Clean up the temporary directory
-rm -rf "$TMP_DIR"
-echo "Cleaned up temporary directory $TMP_DIR"

From 36506abc0485550a82137ae7f8908d86114ebeed Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 23:19:46 -0700
Subject: [PATCH 15/19] another fix

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index bf8281d154..b2570c1af5 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -32,7 +32,7 @@ trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT

 # Loop over all the recipe runs and package them into a tarball
 for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
-    exp_name=$(basename -- $(cut -d/ -f3 <<<$tbevent) -logs)
+    exp_name=$(basename -- $(cut -d/ -f2 <<<$tbevent) -logs)
     # Obfuscate the hostname
     # events.out.tfevents.1744822578..780899.0
     obfuscated_event_path=$(basename $tbevent | awk -F. '{print $1"."$2"."$3"."$4".HOSTNAME."$(NF-1)"."$NF}')

From bd352faf1f019753f8c88a7f9433e49e455280a6 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Sun, 27 Apr 2025 23:26:18 -0700
Subject: [PATCH 16/19] final fix

Signed-off-by: Terry Kong
---
 tools/package_release_runs.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/package_release_runs.sh b/tools/package_release_runs.sh
index b2570c1af5..357c9ad618 100755
--- a/tools/package_release_runs.sh
+++ b/tools/package_release_runs.sh
@@ -15,8 +15,10 @@ shopt -s globstar

 OUTPUT_TAR="release_runs-$(git rev-parse --short HEAD).tar.gz"

+TB_EVENTS=$(ls code_snapshots/*/tests/test_suites/**/logs/*/tensorboard/events* || true)
+
 # Check if the glob expanded to any files
-if [ -z "$(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events* 2>/dev/null || true)" ]; then
+if [ -z "$TB_EVENTS" ]; then
     echo "Error: No tensorboard event files found matching the pattern."
     exit 1
 elif [[ -f $OUTPUT_TAR ]]; then
@@ -31,7 +33,7 @@ echo "Created temporary directory: $TMP_DIR"

 trap "echo 'Cleaning up temporary directory $TMP_DIR'; rm -rf $TMP_DIR" EXIT

 # Loop over all the recipe runs and package them into a tarball
-for tbevent in $(ls code_snapshots/*/recipes/**/logs/*/tensorboard/events*); do
+for tbevent in $TB_EVENTS; do
     exp_name=$(basename -- $(cut -d/ -f2 <<<$tbevent) -logs)
     # Obfuscate the hostname
     # events.out.tfevents.1744822578..780899.0
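Patches 14 through 16 converge on a defensive shell pattern worth isolating: capture the `**` glob once, fail fast when it matches nothing, and tie temp-directory cleanup to an EXIT trap so it also runs on error paths. A stripped-down sketch of that skeleton (the glob path is abbreviated here for illustration):

```sh
#!/bin/bash
set -eou pipefail
shopt -s globstar  # make ** match nested directories (off by default in bash)

# Capture matches once; `|| true` keeps set -e from aborting on zero matches
TB_EVENTS=$(ls code_snapshots/*/tests/test_suites/**/tensorboard/events* || true)
if [ -z "$TB_EVENTS" ]; then
    echo "Error: no tensorboard event files found" >&2
    exit 1
fi

TMP_DIR=$(mktemp -d)
trap "rm -rf $TMP_DIR" EXIT  # cleanup fires on success, failure, and Ctrl-C
```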
From b0147def9fb95f16ee34312289f7348d000cbc79 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 12:20:10 -0700
Subject: [PATCH 17/19] fix renaming missed

Signed-off-by: Terry Kong
---
 docker/Dockerfile                                                 | 2 +-
 .../llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh            | 2 +-
 tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh | 2 +-
 tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b1977a4ac9..2baf5d4ea3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -17,7 +17,7 @@ RUN chmod 755 /home/ray/.cache

 FROM base AS hermetic

-WORKDIR /opt/reinforcer
+WORKDIR /opt/nemo-rl

 # First copy only the dependency files
 COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
index 1f937018a3..32bb6dacb7 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.sh
@@ -32,7 +32,7 @@ uv run examples/run_sft.py \
 # Convert tensorboard logs to json
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/reinforcer/issues/263
+# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/nemo-rl/issues/263
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
     # TODO: FIGURE OUT CORRECT METRICS
diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
index 2379681138..ac441240fc 100755
--- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
+++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.sh
@@ -31,7 +31,7 @@ uv run examples/run_sft.py \
 # Convert tensorboard logs to json
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/reinforcer/issues/263
+# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/nemo-rl/issues/263
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
diff --git a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
index fd40a85764..9fb5f7839b 100755
--- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
+++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.sh
@@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env

 # TODO: this config can crash on OOM
-# https://github.com/NVIDIA/reinforcer/issues/263
+# https://github.com/NVIDIA/nemo-rl/issues/263

 # ===== BEGIN CONFIG =====
 NUM_NODES=4

From 0f8f5e86575548723b7cd083243ad5cd5715702e Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 14:54:38 -0700
Subject: [PATCH 18/19] increase the test time a little and time the functional

Signed-off-by: Terry Kong
---
 .github/workflows/cicd-main.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index e0f0a6532b..c38cc2dd87 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -150,7 +150,7 @@ jobs:
     if: ${{ needs.pre-flight.outputs.test_level != 'none' }}
     with:
       RUNNER: self-hosted-azure
-      TIMEOUT: 60
+      TIMEOUT: 75
       UNIT_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L0|L1|L2)$ ]]; then
@@ -168,10 +168,10 @@
       FUNCTIONAL_TEST_SCRIPT: |
         cd /opt/nemo-rl
         if [[ "${{ needs.pre-flight.outputs.test_level }}" =~ ^(L1|L2)$ ]]; then
-          uv run --no-sync bash ./tests/functional/sft.sh
-          uv run --no-sync bash ./tests/functional/grpo.sh
-          uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
-          uv run --no-sync bash ./tests/functional/dpo.sh
+          time uv run --no-sync bash ./tests/functional/sft.sh
+          time uv run --no-sync bash ./tests/functional/grpo.sh
+          time uv run --no-sync bash ./tests/functional/grpo_multiturn.sh
+          time uv run --no-sync bash ./tests/functional/dpo.sh
         else
           echo Skipping functional tests for level ${{ needs.pre-flight.outputs.test_level }}
         fi

From 139c4cf79e2a61a074942c0e1bfea23347558168 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Mon, 28 Apr 2025 14:58:05 -0700
Subject: [PATCH 19/19] fix tests

Signed-off-by: Terry Kong
---
 tests/functional/dpo.sh            |  6 +++---
 tests/functional/grpo.sh           |  6 +++---
 tests/functional/grpo_multiturn.sh | 19 ++++++++++---------
 tests/functional/sft.sh            |  6 +++---
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh
index e719f84b79..200a08cdd7 100755
--- a/tests/functional/dpo.sh
+++ b/tests/functional/dpo.sh
@@ -19,7 +19,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_dpo.py \
+uv run $PROJECT_ROOT/examples/run_dpo.py \
     cluster.gpus_per_node=2 \
     dpo.max_num_steps=3 \
     dpo.val_batches=1 \
@@ -32,8 +32,8 @@ python -u $PROJECT_ROOT/examples/run_dpo.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["2"] < 0.694' \
diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh
index 93b4ec25e1..bbbbd44a11 100755
--- a/tests/functional/grpo.sh
+++ b/tests/functional/grpo.sh
@@ -19,7 +19,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_grpo_math.py \
+uv run $PROJECT_ROOT/examples/run_grpo_math.py \
     cluster.gpus_per_node=2 \
     grpo.max_num_steps=3 \
     logger.tensorboard_enabled=true \
@@ -29,8 +29,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_math.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \
diff --git a/tests/functional/grpo_multiturn.sh b/tests/functional/grpo_multiturn.sh
index ff9befcdd7..a22153c729 100755
--- a/tests/functional/grpo_multiturn.sh
+++ b/tests/functional/grpo_multiturn.sh
@@ -7,17 +7,19 @@ git config --global --add safe.directory $PROJECT_ROOT

 set -eou pipefail

-LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
-JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
-RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
+EXP_NAME=$(basename $0 .sh)
+EXP_DIR=$SCRIPT_DIR/$EXP_NAME
+LOG_DIR=$EXP_DIR/logs
+JSON_METRICS=$EXP_DIR/metrics.json
+RUN_LOG=$EXP_DIR/run.log

 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

-rm -rf $LOG_DIR
-mkdir -p $LOG_DIR
+rm -rf $EXP_DIR $LOG_DIR
+mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
+uv run $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
     cluster.gpus_per_node=2 \
     grpo.max_rollout_turns=10 \
     grpo.max_num_steps=3 \
@@ -32,9 +34,8 @@ python -u $PROJECT_ROOT/examples/run_grpo_sliding_puzzle.py \
     $@ \
     2>&1 | tee $RUN_LOG

-cd $SCRIPT_DIR
-python json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

-python check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'max(data["train/token_mult_prob_error"]) < 1.1' \
diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index 812733338c..90985ae2c1 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -22,7 +22,7 @@ rm -rf $EXP_DIR $LOG_DIR
 mkdir -p $EXP_DIR $LOG_DIR

 cd $PROJECT_ROOT
-python -u $PROJECT_ROOT/examples/run_sft.py \
+uv run $PROJECT_ROOT/examples/run_sft.py \
     policy.model_name=meta-llama/Llama-3.2-1B \
     cluster.gpus_per_node=2 \
     sft.max_num_steps=10 \
@@ -36,9 +36,9 @@ python -u $PROJECT_ROOT/examples/run_sft.py \
     $@ \
     2>&1 | tee $RUN_LOG

-python -u tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

 # TODO: loss is very noisy, this check is mainly for sanity of immediate divergence
-python -u tests/check_metrics.py $JSON_METRICS \
+uv run tests/check_metrics.py $JSON_METRICS \
     'data["train/loss"]["9"] < 1500' \
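After PATCH 19, every functional script goes through `uv run` and shares the `$EXP_DIR/{logs,metrics.json,run.log}` layout, so reproducing the CI functional stage locally is one command per suite. A sketch under the assumption of a checkout with `uv` installed and at least 2 GPUs (the scripts pass `cluster.gpus_per_node=2`); the metrics paths below follow the layout the scripts set up and are illustrative:

```sh
# From the repo root, mirror what CI runs for one functional suite:
time uv run --no-sync bash ./tests/functional/sft.sh

# The script then dumps TensorBoard logs to JSON and asserts on them,
# equivalent to invoking the two helpers by hand:
uv run tests/json_dump_tb_logs.py tests/functional/sft/logs --output_path metrics.json
uv run tests/check_metrics.py metrics.json 'data["train/loss"]["9"] < 1500'
```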