Merged
25 commits
4a70e24
Add DSv4 B200 configs
wzhao18 Apr 25, 2026
6ee148f
Add changelog
wzhao18 Apr 25, 2026
707f225
fixup
wzhao18 Apr 25, 2026
8ec1310
fixup
wzhao18 Apr 25, 2026
81594d7
fixup
wzhao18 Apr 25, 2026
f2fcfae
fix runners
wzhao18 Apr 25, 2026
98d83ff
Simplify MAX_NUM_BATCHED_TOKENS calculation
wzhao18 Apr 26, 2026
d7df79a
Merge branch 'main' into nv/dsv4-b200-agg
wzhao18 Apr 26, 2026
c18f413
Update perf-changelog.yaml with benchmark changes
wzhao18 Apr 26, 2026
1e95b00
Merge branch 'main' into nv/dsv4-b200-agg
wzhao18 Apr 26, 2026
5bab835
Modify MAX_NUM_BATCHED_TOKENS logic
wzhao18 Apr 26, 2026
e28c638
Fix variable name from 'concurrency' to 'CONC'
wzhao18 Apr 26, 2026
44729b1
Merge branch 'main' into nv/dsv4-b200-agg
wzhao18 Apr 26, 2026
b653b7f
Merge branch 'main' into nv/dsv4-b200-agg
Oseltamivir Apr 26, 2026
e86d6e9
Revise perf-changelog.yaml with new benchmarks
wzhao18 Apr 26, 2026
a222193
Update perf-changelog.yaml
wzhao18 Apr 26, 2026
34d9bc3
Merge branch 'main' into nv/dsv4-b200-agg
wzhao18 Apr 26, 2026
3f038c4
lower batch size to fix OOM
wzhao18 Apr 26, 2026
fae14d9
Remove GPU memory utilization for DP mode
wzhao18 Apr 26, 2026
4d99225
Merge branch 'main' into nv/dsv4-b200-agg
wzhao18 Apr 26, 2026
acb8510
Introduce GMU_ARGS for GPU memory utilization
wzhao18 Apr 27, 2026
53b7f59
Merge branch 'main' into nv/dsv4-b200-agg
Oseltamivir Apr 27, 2026
9c2f8c9
Merge branch 'main' into nv/dsv4-b200-agg
Oseltamivir Apr 27, 2026
984064a
Add low-latency configs
wzhao18 Apr 27, 2026
09599ae
Merge branch 'main' into nv/dsv4-b200-agg
Ankur-singh Apr 27, 2026
21 changes: 21 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -1706,6 +1706,27 @@ dsv4-fp4-b200-sglang:
# DP-attention (DP_ATTENTION=true) — max-throughput CONC range
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

dsv4-fp4-b200-vllm:
image: vllm/vllm-openai:deepseekv4-cu130
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
precision: fp4
framework: vllm
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 128, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 4096 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }

# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
# B200 SGLang recipe as-is until B300-specific tuning is available.
120 changes: 120 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_vllm.sh
@@ -0,0 +1,120 @@
#!/usr/bin/env bash

# DeepSeek-V4-Pro B200 single-node vLLM recipe derived from the B200 pareto
# sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
# (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size).
cquil11 marked this conversation as resolved.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

nvidia-smi

hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# DeepSeek-V4-Pro weights are large; engine startup can exceed the default
# 600s. Give it an hour to load.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1)
if [ "${DP_ATTENTION}" = "true" ]; then
PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
fi

EP_ARGS=()
if [ "${EP_SIZE:-1}" -gt 1 ]; then
EP_ARGS=(--enable-expert-parallel)
fi

# Only override GPU memory utilization in DP-attention mode; TP-only runs
# keep the vLLM default.
GMU_ARGS=()
if [ "${DP_ATTENTION}" = "true" ]; then
GMU_ARGS=(--gpu-memory-utilization 0.85)
fi

if [ "${ISL}" -eq 8192 ] && [ "${CONC}" -le 128 ]; then
MAX_NUM_BATCHED_TOKENS=${ISL}
else
MAX_NUM_BATCHED_TOKENS=2048
fi

BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
BENCHMARK_MAX_MODEL_LEN=4096
fi

if [ "${EVAL_ONLY}" = "true" ]; then
EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN")
export EVAL_MAX_MODEL_LEN
SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
else
SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

set -x
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
"${PARALLEL_ARGS[@]}" \
--pipeline-parallel-size 1 \
--kv-cache-dtype fp8 \
--trust-remote-code \
--block-size 256 \
--no-enable-prefix-caching \
"${EP_ARGS[@]}" \
"${GMU_ARGS[@]}" \
--compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
--attention_config.use_fp4_indexer_cache True \
--tokenizer-mode deepseek_v4 \
--tool-call-parser deepseek_v4 \
--enable-auto-tool-choice \
--reasoning-parser deepseek_v4 \
--max-cudagraph-capture-size 2048 \
--max-model-len "$SERVE_MAX_MODEL_LEN" \
--max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &

Comment on lines +72 to +91
Contributor

🔴 The new vllm serve invocation does not pass --max-num-seqs, while nvidia-master.yaml schedules this config with conc-end: 4096 (1k1k DP-attn) and conc-end: 1024 (8k1k DP-attn). vLLM's per-replica default is 256, so even with DP=8 the engine caps in-flight requests at 8×256 = 2048 < 4096; the high-concurrency points will silently queue at the engine and report throughput/latency reflecting the cap rather than the requested concurrency. Suggest adding --max-num-seqs $CONC (or a high static value like 4096) to match the convention used in the sibling B200 vLLM recipes (gptoss_fp4_b200.sh, kimik2.5_fp4_b200.sh, kimik2.5_int4_b200.sh, dsv4_fp8_h200.sh).

Extended reasoning...

The bug: benchmarks/single_node/dsv4_fp4_b200_vllm.sh:69-85 builds the vllm serve command without passing --max-num-seqs. vLLM's default max_num_seqs is 256 per data-parallel replica. The matrix entry added to .github/configs/nvidia-master.yaml (dsv4-fp4-b200-vllm) schedules:

  • ISL=1024, DP-attn TP=8 (DP=8): conc-end: 4096
  • ISL=8192, DP-attn TP=8 (DP=8): conc-end: 1024

With DP=8 and per-replica default 256, the engine accepts at most 8×256 = 2048 concurrent sequences server-wide. So the CONC=4096 sweep point in the 1k1k DP-attn branch cannot actually be served at the requested concurrency — half the requests sit in the client-side or engine waiting queue while only ~2048 are processed in-flight.

Why this matters for the sweep: This is a benchmark recipe whose entire point is to populate a Pareto curve. At CONC=4096 (and likely the second-highest point too) the reported throughput and latency reflect the server cap, not the requested in-flight count, polluting the curve. The output looks plausible (no crash, no error), so the issue is silent — exactly the kind of regression the verifiers flagged as "normal" rather than "nit."

An internal contradiction in the script confirms intent: line 83 sets --max-cudagraph-capture-size 2048, indicating the author expects to capture CUDA graphs for batch sizes up to 2048. But with default --max-num-seqs 256, only batch sizes up to 256 are ever realized per replica, so the larger captured graphs are never exercised. This implies the author meant to lift the seq cap and just forgot.

Sibling recipes consistently set this: every other vLLM script in benchmarks/single_node/ that sweeps comparable concurrencies sets --max-num-seqs explicitly — gptoss_fp4_b200.sh:61 uses 512, kimik2.5_fp4_b200.sh:41 and kimik2.5_int4_b200.sh:41 use $CONC, dsv4_fp8_h200.sh:56 uses 512. The b300 sister script dsv4_fp4_b300_vllm.sh shares the omission, but its conc-end caps at 512 with TP=8/DP≤4 so the default 256-per-replica × DP≥2 is enough; the b200 sweep is the first one to extend past the implicit cap.

Step-by-step proof:

  1. CI launches dsv4-fp4-b200-vllm for the 1k1k DP-attn branch with CONC=4096, TP=8, DP_ATTENTION=true.
  2. PARALLEL_ARGS in dsv4_fp4_b200_vllm.sh:34-37 sets --tensor-parallel-size 1 --data-parallel-size 8. vLLM creates 8 replicas, each with the default max_num_seqs = 256. Total simultaneous in-flight cap: 8×256 = 2048.
  3. run_benchmark_serving (line 95) launches with --max-concurrency 4096 --num-prompts 40960. The client opens ~4096 in-flight requests and feeds them to vLLM.
  4. vLLM accepts ~2048 sequences, queues the rest. Throughput plateaus at the 2048-cap saturation, but the benchmark records this as the CONC=4096 data point.
  5. The Pareto plot then shows two adjacent points (e.g., CONC=2048 and CONC=4096) with effectively identical server-side behavior, distorting the high end of the curve.

Fix: Add --max-num-seqs "$CONC" to the vllm serve invocation (or a static cap ≥ 4096). Using $CONC follows the pattern in kimik2.5_fp4_b200.sh / kimik2.5_int4_b200.sh and ensures the engine never becomes the bottleneck for the configured sweep point. Alternatively, cap conc-end in the matrix at a value that fits the per-replica default × DP, but that loses sweep coverage and is the less attractive option given the matrix already specifies 4096.
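
A minimal sketch of the suggested change (not applied in the merged script), following the --max-num-seqs "$CONC" convention the comment cites from the kimik2.5 recipes; only the relevant flags are shown, and all other serve flags stay as in the script above:

# Hypothetical: raise the per-replica sequence cap to the sweep point's
# concurrency so the engine never limits in-flight requests.
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
    "${PARALLEL_ARGS[@]}" \
    "${EP_ARGS[@]}" \
    "${GMU_ARGS[@]}" \
    --max-num-seqs "$CONC" \
    --max-model-len "$SERVE_MAX_MODEL_LEN" \
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &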

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
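
For reference, a minimal sketch of running this recipe outside CI with the environment variables it validates via check_env_vars; every value below is illustrative rather than taken from this PR:

# Hypothetical local invocation of the 1k/1k DP-attention sweep point.
MODEL=deepseek-ai/DeepSeek-V4-Pro \
TP=8 EP_SIZE=8 DP_ATTENTION=true \
ISL=1024 OSL=1024 CONC=256 \
MAX_MODEL_LEN=4096 RANDOM_RANGE_RATIO=0.8 \
RESULT_FILENAME=dsv4_fp4_b200_vllm_1k1k_c256.json \
RUN_EVAL=false \
bash benchmarks/single_node/dsv4_fp4_b200_vllm.sh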
9 changes: 9 additions & 0 deletions perf-changelog.yaml
@@ -1898,3 +1898,12 @@
description:
- Add low-latency configs and remove non-pareto configs
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1193

- config-keys:
- dsv4-fp4-b200-vllm
description:
- "Add DeepSeek-V4-Pro single-node B200 vLLM benchmark derived from B200 pareto sweep"
- "ISL=1024: TP8 conc 4-128; DP8 (dp-attn) conc 256-4096"
- "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156

10 changes: 9 additions & 1 deletion runners/launch_b200-cw.sh
@@ -6,6 +6,14 @@ export PORT=8888
MODEL_CODE="${EXP_NAME%%_*}"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else).
BENCH_BASE="benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200"
Collaborator

Double-check that this is back-compatible.

Collaborator Author

Yes, I think this is back-compatible: it first checks whether dsv4_fp4_b200_vllm.sh exists; if not, it falls back to checking whether a file with the old naming scheme exists.

Collaborator

Does it even need to be backwards compatible? @Oseltamivir
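
To make the resolution order concrete, a hypothetical walk-through assuming MODEL_CODE=dsv4, PRECISION=fp4, and no MTP suffix:

# Illustrative only; mirrors the lookup logic added below.
BENCH_BASE="benchmarks/single_node/dsv4_fp4_b200"
for FRAMEWORK in vllm trt; do
    FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
    echo "$FRAMEWORK: try ${BENCH_BASE}_${FRAMEWORK}.sh, else ${BENCH_BASE}${FRAMEWORK_SUFFIX}.sh"
done
# vllm: try dsv4_fp4_b200_vllm.sh, else dsv4_fp4_b200.sh  (old scripts still resolve)
# trt:  try dsv4_fp4_b200_trt.sh,  else dsv4_fp4_b200_trt.sh  (identical, so trt is unaffected)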

BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
fi

PARTITION="b200"
SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -58,6 +66,6 @@ srun --jobid=$JOB_ID \
--container-mount-home \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL \
bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"

scancel $JOB_ID
10 changes: 9 additions & 1 deletion runners/launch_b200-dgxc.sh
@@ -253,6 +253,14 @@ else
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else).
BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
fi
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
@@ -290,5 +298,5 @@ else
--no-container-mount-home \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"
fi
10 changes: 9 additions & 1 deletion runners/launch_b200-nb.sh
@@ -4,6 +4,14 @@ HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/"
PARTITION="main"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else).
BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
BENCH_SCRIPT="${BENCH_BASE}${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
fi

UCX_NET_DEVICES=eth0

@@ -27,4 +35,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME"
--container-writable \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"