From d1dc8c4d749fa057f35dbaf061b8593918008e5b Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Tue, 30 Sep 2025 16:27:59 -0700 Subject: [PATCH 1/3] improve CI stability with docker --- runners/launch_b200-nv.sh | 2 +- runners/launch_b200-nvd.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 85efecf96..243e624f9 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -17,7 +17,7 @@ srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ +--no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 14be9fffc..732c670e8 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -19,13 +19,13 @@ if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; fi set -x -docker run --rm -d --network host --name $server_name \ +docker run --rm -d --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $CACHE_DIR:/workspace/flashinfer_cache \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ --e FLASHINFER_WORKSPACE_BASE=/workspace/flashinfer_cache \ +-e FLASHINFER_WORKSPACE_BASE=/workspace/flashinfer_cache -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \ From 
1d19374f812b295c40b2d87d12913c819d0ff703 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Tue, 30 Sep 2025 16:54:02 -0700 Subject: [PATCH 2/3] add comment on docker init --- runners/launch_b200-nvd.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 732c670e8..566ffe4b2 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -19,6 +19,9 @@ if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; fi set -x +# Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup +# https://www.paolomainardi.com/posts/docker-run-init/ + docker run --rm -d --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From d825e0752b3271b1c218658bec5ed893d40a2d59 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Tue, 30 Sep 2025 17:02:22 -0700 Subject: [PATCH 3/3] add comments --- runners/launch_b200-nvd.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 566ffe4b2..0690c8340 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -20,7 +20,12 @@ fi set -x # Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup -# https://www.paolomainardi.com/posts/docker-run-init/ +# Ref: https://www.paolomainardi.com/posts/docker-run-init/ + +# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs. +# Disabling it can reduce perf but will improve CI stability, i.e. we won't see vLLM/SGLang crashes.
+# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register + docker run --rm -d --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \