diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 245b74762..98080efea 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -24,6 +24,8 @@ b200:
 - 'b200-nvd_1'
 - 'b200-nvd_2'
 - 'b200-nvd_3'
+- 'b200-dgxc_1'
+- 'b200-dgxc_2'
 mi300x:
 - 'mi300x-amd_0'
 - 'mi300x-amd_1'
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
new file mode 100644
index 000000000..4d8ec0aed
--- /dev/null
+++ b/runners/launch_b200-dgxc.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/bash
+
+HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
+FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+PORT=8888
+
+# Derive the model name from the model path
+MODEL_NAME=$(basename "$MODEL")
+
+server_name="bmk-server"
+
+nvidia-smi
+
+# GPUs must be idle before we start
+if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
+  echo "[ERROR] GPU busy from previous run"; nvidia-smi; exit 1
+fi
+
+set -x
+# Use the --init flag to run an init process (PID 1) inside the container for better signal handling and zombie process cleanup
+# Ref: https://www.paolomainardi.com/posts/docker-run-init/
+
+# NCCL_GRAPH_REGISTER tries to automatically enable user buffer registration with CUDA Graphs.
+# Disabling it can reduce perf but improves CI stability, i.e. we won't see vLLM/SGLang crashes.
+# Ref: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register
+
+if [[ "$MODEL" == "nvidia/DeepSeek-R1-0528-FP4" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
+  if [[ "$OSL" == "8192" ]]; then
+    export NUM_PROMPTS=$(( CONC * 20 ))
+  else
+    export NUM_PROMPTS=$(( CONC * 50 ))
+  fi
+else
+  export NUM_PROMPTS=$(( CONC * 10 ))
+fi
+
+docker run --rm --init --network host --name $server_name \
+--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
+-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \
+-e NCCL_GRAPH_REGISTER=0 \
+-e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
+-e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \
+--entrypoint=/bin/bash \
+$(echo "$IMAGE" | sed 's/#/\//') \
+benchmarks/"${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}_docker.sh"
+
+# Try a graceful stop first
+docker stop -t 90 "$server_name" || true
+# Wait until it's really dead
+docker wait "$server_name" >/dev/null 2>&1 || true
+# Force remove if anything lingers
+docker rm -f "$server_name" >/dev/null 2>&1 || true
+
+# Give GPU processes a moment to fully terminate
+sleep 2
+# Verify GPUs are now idle; if not, print diagnostics and (optionally) reset
+if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then
+  echo "[WARN] After stop, GPU still busy:"; nvidia-smi
+  # Last resort if the driver allows it and GPUs appear otherwise idle:
+  #nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
+fi
+
+nvidia-smi
diff --git a/runners/launch_b200-nvd.sh b/runners/launch_b200-nvd.sh
index c5216b006..ebfa67458 100644
--- a/runners/launch_b200-nvd.sh
+++ b/runners/launch_b200-nvd.sh
@@ -39,7 +39,7 @@ docker run --rm --init --network host --name $server_name \
 --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 -v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
--e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e ISL -e OSL -e PORT=$PORT -e EP_SIZE -e DP_ATTENTION \
 -e NCCL_GRAPH_REGISTER=0 \
 -e TORCH_CUDA_ARCH_LIST="10.0" -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
 -e PYTHONPYCACHEPREFIX=/tmp/pycache/ -e RESULT_FILENAME -e RANDOM_RANGE_RATIO -e NUM_PROMPTS \