Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang:
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }

# Single-node DeepSeek-V4-Pro FP4 benchmark on B200 GPUs, served with SGLang.
# NOTE(review): indentation below may have been flattened by the diff view —
# verify nesting (osl under its - isl entry) against the file on disk.
dsv4-fp4-b200-sglang:
# Pinned image tag; see the launcher scripts for the /workspace bind-mount
# workaround this specific tag requires.
image: lmsysorg/sglang:deepseek-v4-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200
precision: fp4
framework: sglang
multinode: false
# Two ISL/OSL shapes; each sweeps concurrency from conc-start to conc-end.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# ep: 1 here, although the serve script launches with the deepep a2a
# backend — TODO confirm the intended expert-parallel degree.
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
# Longer prefill (8k ISL) caps the sweep at lower concurrency.
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }

# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
# B200 SGLang recipe as-is until B300-specific tuning is available.
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Single-node DeepSeek-V4-Pro FP4 serving benchmark on B200 GPUs via SGLang.
#
# Required env vars (validated by check_env_vars): MODEL, TP, CONC, ISL, OSL,
# RANDOM_RANGE_RATIO, RESULT_FILENAME.
# Optional: PORT (default 30000), RUN_EVAL ("true" to run lm-eval afterwards).
#
# Helper functions (check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, append_lm_eval_summary, stop_gpu_monitor)
# come from benchmark_lib.sh.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# Guard with :- so the check also works when run outside Slurm with `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown-node}"
fi

# Pre-fetch weights into the HF hub cache so server startup isn't blocked on
# a download.
hf download "$MODEL"

nvidia-smi

# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
# all ranks.
unset CUDA_VISIBLE_DEVICES

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-30000}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

set -x
# tp/dp are pinned to 8 per the recipe this config follows (the TP env var is
# validated above but the sweep only uses TP=8 on this runner).
# EAGLE speculative decoding is enabled: 1 step, top-k 1, 2 draft tokens.
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 \
SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
sglang serve \
    --trust-remote-code \
    --model-path "$MODEL" \
    --tp 8 \
    --dp 8 \
    --enable-dp-attention \
    --moe-a2a-backend deepep \
    --speculative-algo EAGLE \
    --speculative-num-steps 1 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 2 \
    --mem-fraction-static 0.82 \
    --cuda-graph-max-bs 64 \
    --max-running-requests 128 \
    --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \
    --host 0.0.0.0 \
    --port "$PORT" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Benchmark client dependencies (the serving image does not ship these).
pip install -q datasets pandas

run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend sglang \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts $((CONC * 10)) \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir "$PWD/"

# Optional accuracy eval against the same running server.
if [ "${RUN_EVAL:-}" = "true" ]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
- config-keys:
- dsv4-fp4-b200-sglang
description:
- "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP1, dp-attention)"
- "Container: lmsysorg/sglang:deepseek-v4-blackwell"
- "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
- "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config"
- "Prefix caching left at defaults; EAGLE speculative decoding enabled (1 step, 2 draft tokens)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131

- config-keys:
- dsr1-fp8-h100-dynamo-trt
- dsr1-fp8-h100-dynamo-sglang
Expand Down
18 changes: 14 additions & 4 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,23 @@ EOF

else

HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

Expand All @@ -276,9 +286,9 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
fi
16 changes: 13 additions & 3 deletions runners/launch_b200-nb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,24 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

UCX_NET_DEVICES=eth0

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

set -x
srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
--container-image=$IMAGE \
--container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-remap-root \
--container-writable \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh