From fd86318271bfa5b5c0fa686156c4b126a74194fe Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 6 Apr 2025 23:23:09 -0700 Subject: [PATCH 1/4] fix: default to less verbose logging + uv-venv log once per worker Signed-off-by: Terry Kong --- README.md | 2 -- docs/cluster.md | 2 -- tests/functional/grpo.sh | 1 - tests/functional/sft.sh | 1 - tests/run_unit.sh | 1 - 5 files changed, 7 deletions(-) diff --git a/README.md b/README.md index 044c9cd954..f08ea94de5 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,6 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) # SFT experiment uses Llama-3.1-8B model COMMAND="uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'" \ -RAY_DEDUP_LOGS=0 \ UV_CACHE_DIR=YOUR_UV_CACHE_DIR \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ @@ -160,7 +159,6 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) # grpo_math_8b uses Llama-3.1-8B-Instruct model COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=2 checkpointing.checkpoint_dir='results/llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \ -RAY_DEDUP_LOGS=0 \ UV_CACHE_DIR=YOUR_UV_CACHE_DIR \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ diff --git a/docs/cluster.md b/docs/cluster.md index d683de9ac2..c949b5eb77 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -15,7 +15,6 @@ NUM_ACTOR_NODES=1 # Total nodes requested (head is colocated on ray-worker-0) COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py" \ -RAY_DEDUP_LOGS=0 \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ sbatch \ @@ -66,7 +65,6 @@ To run interactively, launch the same command as the [Batched Job Submission](#b # Run from the root of NeMo-Reinforcer repo NUM_ACTOR_NODES=1 # Total nodes requested (head is colocated on ray-worker-0) -RAY_DEDUP_LOGS=0 \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ sbatch \ diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh index faaed1903c..497a240d84 100755 --- a/tests/functional/grpo.sh +++ b/tests/functional/grpo.sh @@ -10,7 +10,6 @@ set -eou pipefail LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log -export RAY_DEDUP_LOGS=0 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh index 82d263c9da..c2989a4808 100755 --- a/tests/functional/sft.sh +++ b/tests/functional/sft.sh @@ -10,7 +10,6 @@ set -eou pipefail LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log -export RAY_DEDUP_LOGS=0 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} diff --git a/tests/run_unit.sh b/tests/run_unit.sh index f51ff49ff6..093583faf9 100755 --- a/tests/run_unit.sh +++ b/tests/run_unit.sh @@ -29,7 +29,6 @@ if ! ray status &>/dev/null; then fi export PYTHONPATH=$(realpath ${SCRIPT_DIR}/..):${PYTHONPATH:-} -export RAY_DEDUP_LOGS=0 # Run unit tests echo "Running unit tests..." From b6bd50696cdf8e9ea2d83c014ffebf204adf7988 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 6 Apr 2025 23:34:37 -0700 Subject: [PATCH 2/4] gooo Signed-off-by: Terry Kong --- .gitignore | 1 + nemo_reinforcer/utils/venvs.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 4884d4dc81..e4d68f2e40 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ wandb/ checkpoints/ results/ code_snapshots/ +stress_test_results/ diff --git a/nemo_reinforcer/utils/venvs.py b/nemo_reinforcer/utils/venvs.py index 12b1002810..772a5ad5a9 100644 --- a/nemo_reinforcer/utils/venvs.py +++ b/nemo_reinforcer/utils/venvs.py @@ -15,6 +15,7 @@ import subprocess import shlex import logging +from functools import lru_cache dir_path = os.path.dirname(os.path.abspath(__file__)) git_root = os.path.abspath(os.path.join(dir_path, "../..")) @@ -23,12 +24,17 @@ logger = logging.getLogger(__name__) +@lru_cache(maxsize=None) def create_local_venv(py_executable: str, venv_name: str) -> str: """Create a virtual environment using uv and execute a command within it. The output can be used as a py_executable for a Ray worker assuming the worker nodes also have access to the same file system as the head node. + This function is cached to avoid multiple calls to uv to create the same venv. + This is mainly to avoid the duplicate logging of the venv creation since it is + safe to call this multiple times. + Args: py_executable (str): Command to run with the virtual environment (e.g., "uv.sh run --locked") venv_name (str): Name of the virtual environment (e.g., "foobar.Worker") From 3d883005c2231333d072bacc820875344846d9d4 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 6 Apr 2025 23:38:01 -0700 Subject: [PATCH 3/4] revert Signed-off-by: Terry Kong --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index e4d68f2e40..4884d4dc81 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,3 @@ wandb/ checkpoints/ results/ code_snapshots/ -stress_test_results/ From f1f24c504ba52003e2e1a712fa20f7fb75dac220 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 6 Apr 2025 23:38:54 -0700 Subject: [PATCH 4/4] succinct Signed-off-by: Terry Kong --- nemo_reinforcer/utils/venvs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo_reinforcer/utils/venvs.py b/nemo_reinforcer/utils/venvs.py index 772a5ad5a9..e1cae61e0f 100644 --- a/nemo_reinforcer/utils/venvs.py +++ b/nemo_reinforcer/utils/venvs.py @@ -31,9 +31,8 @@ def create_local_venv(py_executable: str, venv_name: str) -> str: The output can be used as a py_executable for a Ray worker assuming the worker nodes also have access to the same file system as the head node. - This function is cached to avoid multiple calls to uv to create the same venv. - This is mainly to avoid the duplicate logging of the venv creation since it is - safe to call this multiple times. + This function is cached to avoid multiple calls to uv to create the same venv, + which avoids duplicate logging. Args: py_executable (str): Command to run with the virtual environment (e.g., "uv.sh run --locked")