From 51650bddde8addf117df5f77e3ccd8ba893f00c2 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 13 Apr 2025 23:15:15 -0700 Subject: [PATCH 1/3] fix: allow all ray ports to be configurable Signed-off-by: Terry Kong --- .gitignore | 1 + ray.sub | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 4884d4dc81..e4d68f2e40 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ wandb/ checkpoints/ results/ code_snapshots/ +stress_test_results/ diff --git a/ray.sub b/ray.sub index f0f31fd4e0..8a8d7889cb 100644 --- a/ray.sub +++ b/ray.sub @@ -18,6 +18,18 @@ CONTAINER=$CONTAINER MOUNTS=$MOUNTS COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up. ######################################################## +# Ray ports +GCS_SERVER_PORT=${GCS_SERVER_PORT:-6379} +DASHBOARD_PORT=${DASHBOARD_PORT:-8265} +OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-8076} +NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-8077} +DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:-52365} +DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-52366} +METRICS_PORT=${METRICS_PORT:-9002} +# NOTE: Ports start above 20000 since 10001-10257 frequently ran into conflicts +MIN_WORKER_PORT=${MIN_WORKER_PORT:-20001} +MAX_WORKER_PORT=${MAX_WORKER_PORT:-20257} +######################################################## # Defaults to placing uv cache inside the SLURM_SUBMIT_DIR # This directory is mounted into the container at /home/ray/.cache/uv so it is shared between the head and worker nodes @@ -58,8 +70,7 @@ done head_node=${nodes_array[0]} head_node_ip=${ip_addresses_array[0]} -port=41993 -ip_head=$head_node_ip:$port +ip_head=$head_node_ip:$GCS_SERVER_PORT # First we start the head of the ray cluster on one of the physical nodes # Set GPU/CPU resources to 0 to avoid scheduling on the head node @@ -75,7 +86,13 @@ ray start --head \ --num-cpus=0 \ --num-gpus=0 \ --node-ip-address="$head_node_ip" \ ---port=$port \ +--port=${GCS_SERVER_PORT} \ +--dashboard-port=${DASHBOARD_PORT} \ +--object-manager-port=${OBJECT_MANAGER_PORT} \ +--node-manager-port=${NODE_MANAGER_PORT} \ +--metrics-export-port=${METRICS_PORT} \ +--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \ +--dashboard-agent-listen-port=${DASHBOARD_AGENT_PORT} \ --block EOFINNER chmod +x /launch-head.sh @@ -96,9 +113,6 @@ NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) # Start Ray worker nodes # We want 1 Ray worker node per physical node # Worker nodes are started with ray start but without the --head flag -# NOTE: Ports start above 20000 since 10001-10257 frequently ran into conflicts -min_worker_port=20001 -max_worker_port=20257 for ((i = 0; i < SLURM_JOB_NUM_NODES; i++)); do node_i=${nodes_array[$i]} From de677290c9958d0482c10f05f1df83779fbbca5a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 13 Apr 2025 23:17:54 -0700 Subject: [PATCH 2/3] missing upper Signed-off-by: Terry Kong --- ray.sub | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray.sub b/ray.sub index 8a8d7889cb..cc72295f2a 100644 --- a/ray.sub +++ b/ray.sub @@ -122,8 +122,8 @@ cat < Date: Sun, 13 Apr 2025 23:22:32 -0700 Subject: [PATCH 3/3] revert Signed-off-by: Terry Kong --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index e4d68f2e40..4884d4dc81 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,3 @@ wandb/ checkpoints/ results/ code_snapshots/ -stress_test_results/