diff --git a/ray.sub b/ray.sub index f0f31fd4e0..cc72295f2a 100644 --- a/ray.sub +++ b/ray.sub @@ -18,6 +18,18 @@ CONTAINER=$CONTAINER MOUNTS=$MOUNTS COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up. ######################################################## +# Ray ports +GCS_SERVER_PORT=${GCS_SERVER_PORT:-6379} +DASHBOARD_PORT=${DASHBOARD_PORT:-8265} +OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-8076} +NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-8077} +DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:-52365} +DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-52366} +METRICS_PORT=${METRICS_PORT:-9002} +# NOTE: Ports start above 20000 since 10001-10257 frequently ran into conflicts +MIN_WORKER_PORT=${MIN_WORKER_PORT:-20001} +MAX_WORKER_PORT=${MAX_WORKER_PORT:-20257} +######################################################## # Defaults to placing uv cache inside the SLURM_SUBMIT_DIR # This directory is mounted into the container at /home/ray/.cache/uv so it is shared between the head and worker nodes @@ -58,8 +70,7 @@ done head_node=${nodes_array[0]} head_node_ip=${ip_addresses_array[0]} -port=41993 -ip_head=$head_node_ip:$port +ip_head=$head_node_ip:$GCS_SERVER_PORT # First we start the head of the ray cluster on one of the physical nodes # Set GPU/CPU resources to 0 to avoid scheduling on the head node @@ -75,7 +86,13 @@ ray start --head \ --num-cpus=0 \ --num-gpus=0 \ --node-ip-address="$head_node_ip" \ ---port=$port \ +--port=${GCS_SERVER_PORT} \ +--dashboard-port=${DASHBOARD_PORT} \ +--object-manager-port=${OBJECT_MANAGER_PORT} \ +--node-manager-port=${NODE_MANAGER_PORT} \ +--metrics-export-port=${METRICS_PORT} \ +--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \ +--dashboard-agent-listen-port=${DASHBOARD_AGENT_PORT} \ --block EOFINNER chmod +x /launch-head.sh @@ -96,9 +113,6 @@ NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES)) # Start Ray worker nodes # We want 1 Ray worker node per physical node # Worker nodes are started with ray start but without the --head flag -# NOTE: Ports start above 20000 since 10001-10257 frequently ran into conflicts -min_worker_port=20001 -max_worker_port=20257 for ((i = 0; i < SLURM_JOB_NUM_NODES; i++)); do node_i=${nodes_array[$i]} @@ -108,8 +122,8 @@ cat <