diff --git a/README.md b/README.md index 044c9cd954..f08ea94de5 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,6 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) # SFT experiment uses Llama-3.1-8B model COMMAND="uv pip install -e .; uv run ./examples/run_sft.py --config examples/configs/sft.yaml cluster.num_nodes=2 cluster.gpus_per_node=8 checkpointing.checkpoint_dir='results/sft_llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='sft-llama8b'" \ -RAY_DEDUP_LOGS=0 \ UV_CACHE_DIR=YOUR_UV_CACHE_DIR \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ @@ -160,7 +159,6 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S) # grpo_math_8b uses Llama-3.1-8B-Instruct model COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py --config examples/configs/grpo_math_8B.yaml cluster.num_nodes=2 checkpointing.checkpoint_dir='results/llama8b_2nodes' logger.wandb_enabled=True logger.wandb.name='grpo-llama8b_math'" \ -RAY_DEDUP_LOGS=0 \ UV_CACHE_DIR=YOUR_UV_CACHE_DIR \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ diff --git a/docs/cluster.md b/docs/cluster.md index d683de9ac2..c949b5eb77 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -15,7 +15,6 @@ NUM_ACTOR_NODES=1 # Total nodes requested (head is colocated on ray-worker-0) COMMAND="uv pip install -e .; uv run ./examples/run_grpo_math.py" \ -RAY_DEDUP_LOGS=0 \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ sbatch \ @@ -66,7 +65,6 @@ To run interactively, launch the same command as the [Batched Job Submission](#b # Run from the root of NeMo-Reinforcer repo NUM_ACTOR_NODES=1 # Total nodes requested (head is colocated on ray-worker-0) -RAY_DEDUP_LOGS=0 \ CONTAINER=YOUR_CONTAINER \ MOUNTS="$PWD:$PWD" \ sbatch \ diff --git a/nemo_reinforcer/utils/venvs.py b/nemo_reinforcer/utils/venvs.py index 12b1002810..e1cae61e0f 100644 --- a/nemo_reinforcer/utils/venvs.py +++ b/nemo_reinforcer/utils/venvs.py @@ -15,6 +15,7 @@ import subprocess import shlex import logging +from functools import lru_cache dir_path = os.path.dirname(os.path.abspath(__file__)) git_root = os.path.abspath(os.path.join(dir_path, "../..")) @@ -23,12 +24,16 @@ logger = logging.getLogger(__name__) +@lru_cache(maxsize=None) def create_local_venv(py_executable: str, venv_name: str) -> str: """Create a virtual environment using uv and execute a command within it. The output can be used as a py_executable for a Ray worker assuming the worker nodes also have access to the same file system as the head node. + This function is cached to avoid multiple calls to uv to create the same venv, + which avoids duplicate logging. + Args: py_executable (str): Command to run with the virtual environment (e.g., "uv.sh run --locked") venv_name (str): Name of the virtual environment (e.g., "foobar.Worker") diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh index faaed1903c..497a240d84 100755 --- a/tests/functional/grpo.sh +++ b/tests/functional/grpo.sh @@ -10,7 +10,6 @@ set -eou pipefail LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log -export RAY_DEDUP_LOGS=0 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh index 82d263c9da..c2989a4808 100755 --- a/tests/functional/sft.sh +++ b/tests/functional/sft.sh @@ -10,7 +10,6 @@ set -eou pipefail LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log -export RAY_DEDUP_LOGS=0 export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} diff --git a/tests/run_unit.sh b/tests/run_unit.sh index f51ff49ff6..093583faf9 100755 --- a/tests/run_unit.sh +++ b/tests/run_unit.sh @@ -29,7 +29,6 @@ if ! ray status &>/dev/null; then fi export PYTHONPATH=$(realpath ${SCRIPT_DIR}/..):${PYTHONPATH:-} -export RAY_DEDUP_LOGS=0 # Run unit tests echo "Running unit tests..."