Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2409,6 +2409,27 @@ dsr1-fp8-b200-trt-mtp:
# TP8 for all points
- { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }

# DeepSeek-V4-Pro recipe from https://vllm.ai/blog/deepseek-v4
# Runs on 8xB200 with DP=8 + expert parallelism (TP=1 per replica),
# FP8 KV cache, block size 256, and FP4 indexer cache.
dsv4-fp4-b200-vllm:
image: vllm/vllm-openai:deepseekv4-cu130
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200
precision: fp4
framework: vllm
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }

dsr1-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.9-cu130
model: deepseek-ai/DeepSeek-R1-0528
Expand Down
Empty file modified benchmarks/single_node/dsv4_fp4_b200.sh
100755 → 100644
Empty file.
82 changes: 82 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_vllm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Benchmark DeepSeek-V4-Pro with vLLM on a single 8xB200 node.
#
# Required env vars (validated by check_env_vars from benchmark_lib.sh):
#   MODEL, TP, CONC, ISL, OSL, MAX_MODEL_LEN, RANDOM_RANGE_RATIO, RESULT_FILENAME
# Optional env vars:
#   PORT (default 8888), EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID/SLURMD_NODENAME

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    MAX_MODEL_LEN \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME

# ${VAR:-} defaults keep these probes safe even if a sourced lib sets `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown-node}"
fi

nvidia-smi

# Pre-fetch weights so server startup time is not dominated by the download.
hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# Eval-only runs shrink the context window to the eval harness requirement.
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# Per https://vllm.ai/blog/deepseek-v4 the DeepSeek-V4-Pro recipe on 8xB200
# runs with EP + DP=8 (no --tensor-parallel-size flag). TP from the search
# space is used only for GPU allocation by the runner and as the DP size.
set -x
VLLM_ENGINE_READY_TIMEOUT_S=3600 \
vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
    --trust-remote-code \
    --kv-cache-dtype fp8 \
    --block-size 256 \
    --no-enable-prefix-caching \
    --enable-expert-parallel \
    --data-parallel-size "$TP" \
    --max-model-len "$MAX_MODEL_LEN" \
    --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
    --attention_config.use_fp4_indexer_cache=True \
    --tokenizer-mode deepseek_v4 \
    --tool-call-parser deepseek_v4 \
    --enable-auto-tool-choice \
    --reasoning-parser deepseek_v4 > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

# num-prompts scales with concurrency so each run holds steady-state load.
run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/ \
    --trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [[ "${RUN_EVAL:-}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop GPU monitoring. NOTE(review): the vllm server ($SERVER_PID) is not
# explicitly killed here — presumably job teardown reaps it; confirm against
# benchmark_lib.sh / the runner's scancel.
stop_gpu_monitor
set +x
16 changes: 15 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1755,7 +1755,7 @@
- "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
- "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130

- config-keys:
- dsv4-fp4-b300-sglang
description:
Expand Down Expand Up @@ -1812,3 +1812,17 @@
- "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
- "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129

- config-keys:
- dsv4-fp4-b200-vllm
description:
- "Add DeepSeek-V4-Pro vLLM B200 benchmark per https://vllm.ai/blog/deepseek-v4"
- "Image: vllm/vllm-openai:deepseekv4-cu130"
- "Model: deepseek-ai/DeepSeek-V4-Pro"
- "8xB200 recipe: DP=8 + expert parallelism (TP=1/replica), FP8 KV cache, block size 256, FP4 indexer cache"
- "Flags: --trust-remote-code, --kv-cache-dtype fp8, --block-size 256, --enable-expert-parallel,"
- " --data-parallel-size 8, --compilation-config cudagraph_mode=FULL_AND_PIECEWISE custom_ops=all,"
- " --attention_config.use_fp4_indexer_cache=True, --tokenizer-mode deepseek_v4,"
- " --tool-call-parser deepseek_v4, --enable-auto-tool-choice, --reasoning-parser deepseek_v4"
- "Configs: 1k1k conc 4-1024, 8k1k conc 4-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
14 changes: 12 additions & 2 deletions runners/launch_b200-cw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,19 @@ export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"
export PORT=8888

MODEL_CODE="${EXP_NAME%%_*}"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else)
# for scripts that haven't been retagged yet.
BENCH_BASE="benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
fi

PARTITION="b200"
SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"
Expand Down Expand Up @@ -58,6 +68,6 @@ srun --jobid=$JOB_ID \
--container-mount-home \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL \
bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"

scancel $JOB_ID
13 changes: 11 additions & 2 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,17 @@ else

HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else)
# for scripts that haven't been retagged yet.
BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
fi
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
Expand Down Expand Up @@ -290,5 +299,5 @@ else
--no-container-mount-home \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"
fi
14 changes: 12 additions & 2 deletions runners/launch_b200-nb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,19 @@

HF_HUB_CACHE_MOUNT="/mnt/data/gharunners/hf-hub-cache/"
PARTITION="main"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

# Prefer a framework-tagged script (e.g. dsv4_fp4_b200_vllm.sh) so models
# with multiple inference engines can coexist; fall back to the historical
# name without an engine suffix (`_trt` for trt, bare for everyone else)
# for scripts that haven't been retagged yet.
BENCH_BASE="benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200"
BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"
if [[ ! -f "$BENCH_SCRIPT" ]]; then
LEGACY_FW_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
BENCH_SCRIPT="${BENCH_BASE}${LEGACY_FW_SUFFIX}${SPEC_SUFFIX}.sh"
fi

UCX_NET_DEVICES=eth0

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
Expand All @@ -27,4 +37,4 @@ srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME"
--container-writable \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
bash "$BENCH_SCRIPT"
Loading