diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 85efecf96..243e624f9 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -17,7 +17,7 @@ srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ ---container-mount-home \ +--no-container-mount-home --container-writable \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL \ bash benchmarks/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_slurm.sh diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh index 14be9fffc..0690c8340 100644 --- a/runners/launch_b200-nvd.sh +++ b/runners/launch_b200-nvd.sh @@ -19,13 +19,21 @@ if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; fi set -x -docker run --rm -d --network host --name $server_name \ +# Use --init flag to run an init process (PID 1) inside container for better signal handling and zombie process cleanup +# Ref: https://www.paolomainardi.com/posts/docker-run-init/ + +# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs. +# Disabling it can reduce perf but will improve CI stability. i.e. we won't see vLLM/Sglang crashes. +# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register + + +docker run --rm -d --init --network host --name $server_name \ --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ -v $CACHE_DIR:/workspace/flashinfer_cache \ -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \ -e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT \ --e FLASHINFER_WORKSPACE_BASE=/workspace/flashinfer_cache \ +-e FLASHINFER_WORKSPACE_BASE=/workspace/flashinfer_cache -e NCCL_GRAPH_REGISTER=0 \ -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \ --entrypoint=/bin/bash \ $(echo "$IMAGE" | sed 's/#/\//') \