21 changes: 21 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -2363,6 +2363,27 @@ dsr1-fp8-h200-sglang:
search-space:
- { tp: 8, conc-start: 4, conc-end: 64 }

# DeepSeek-V4-Pro H200 recipe from https://vllm.ai/blog/deepseek-v4
# Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
# flag is omitted. Max-model-len is pinned at 800k per the recipe.
dsv4-fp8-h200-vllm:
image: vllm/vllm-openai:deepseekv4-cu129
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: h200
precision: fp8
framework: vllm
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }

qwen3.5-fp8-h200-sglang:
image: lmsysorg/sglang:v0.5.9-cu129-amd64
model: Qwen/Qwen3.5-397B-A17B-FP8
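A minimal sketch of how the new dsv4-fp8-h200-vllm entry above could be expanded into individual runs of the script added below. The concurrency doubling between conc-start and conc-end, the RANDOM_RANGE_RATIO value, and the result-file naming are illustrative assumptions, not the harness's actual logic; only the env var names come from the script's check_env_vars list.

    # Illustrative expansion of the dsv4-fp8-h200-vllm search space into runs.
    MODEL=deepseek-ai/DeepSeek-V4-Pro
    TP=8
    RANDOM_RANGE_RATIO=0.8   # assumed value for illustration
    for seq in "1024 1024" "8192 1024"; do
      read -r ISL OSL <<< "$seq"
      for CONC in 4 8 16 32 64; do
        RESULT_FILENAME="dsv4_fp8_h200_isl${ISL}_osl${OSL}_conc${CONC}.json"
        export MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME
        bash benchmarks/single_node/dsv4_fp8_h200.sh
      done
    done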
93 changes: 93 additions & 0 deletions benchmarks/single_node/dsv4_fp8_h200.sh
@@ -0,0 +1,93 @@
#!/usr/bin/env bash

# Per https://vllm.ai/blog/deepseek-v4 the DeepSeek-V4-Pro H200 recipe uses
# the cu129 image and omits the FP4 indexer cache flag (H200 has no FP4
# path). Max-model-len is pinned at 800k per the recipe.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

nvidia-smi

hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

# DeepSeek-V4-Pro weights are large; engine startup can exceed the default
# 600s. Give it an hour to load.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
else
MAX_MODEL_LEN_ARG="--max-model-len 800000"
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
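# start_gpu_monitor is provided by benchmark_lib.sh; it is assumed to wrap a
# periodic nvidia-smi query along these lines (standard nvidia-smi options):
#   nvidia-smi --query-gpu=timestamp,power.draw,temperature.gpu,clocks.sm \
#     --format=csv -l 1 > /workspace/gpu_monitor.csv &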

# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
# from the search space is used only for GPU allocation by the runner and
# as the DP size.
set -x
vllm serve $MODEL --host 0.0.0.0 --port $PORT \
--trust-remote-code \
--kv-cache-dtype fp8 \
--block-size 256 \
--no-enable-prefix-caching \
--enable-expert-parallel \
--data-parallel-size $TP \
$MAX_MODEL_LEN_ARG \
--gpu-memory-utilization 0.95 \
--max-num-seqs 512 \
--max-num-batched-tokens 512 \
--no-enable-flashinfer-autotune \
--compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \
--tokenizer-mode deepseek_v4 \
--tool-call-parser deepseek_v4 \
--enable-auto-tool-choice \
--reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
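Before kicking off a long sweep, the launched server can be probed through vLLM's OpenAI-compatible API as a quick sanity check; a minimal sketch, assuming the server from dsv4_fp8_h200.sh is up on the same port (the prompt and token limit are illustrative only):

    # Illustrative smoke test against the server started by dsv4_fp8_h200.sh.
    PORT=${PORT:-8888}
    curl -sf "http://localhost:${PORT}/health" && echo "server healthy"
    curl -s "http://localhost:${PORT}/v1/chat/completions" \
      -H "Content-Type: application/json" \
      -d '{
            "model": "deepseek-ai/DeepSeek-V4-Pro",
            "messages": [{"role": "user", "content": "Say hello in one sentence."}],
            "max_tokens": 32
          }'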
12 changes: 12 additions & 0 deletions perf-changelog.yaml
@@ -1733,3 +1733,15 @@
- "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths"
- "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043

- config-keys:
- dsv4-fp8-h200-vllm
description:
- "Add DeepSeek-V4-Pro vLLM H200 benchmark per https://vllm.ai/blog/deepseek-v4"
- "Image: vllm/vllm-openai:deepseekv4-cu129"
- "Model: deepseek-ai/DeepSeek-V4-Pro"
- "EP + DP=8, FP8 KV cache, block size 256, max-model-len 800000, prefix caching disabled"
- "H200 has no FP4 path, so --attention_config.use_fp4_indexer_cache is omitted"
- "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
- "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130