From 74e99f185fa78731a634a8a44649fea9156a1d04 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 24 Apr 2026 15:07:57 -0700 Subject: [PATCH 1/5] feat: add deepseek-v4-pro b300 vllm benchmark --- .github/configs/nvidia-master.yaml | 28 +++++++ benchmarks/single_node/dsv4_fp8_b300.sh | 104 ++++++++++++++++++++++++ perf-changelog.yaml | 13 ++- 3 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 benchmarks/single_node/dsv4_fp8_b300.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a64803497..5335720de 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2448,6 +2448,34 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 +# pareto sweep. The single-node schema has no explicit data-parallel-size +# field, so dp-attn=true is used as the existing vLLM script switch for DP4 +# layouts on 4 allocated GPUs. +dsv4-fp8-b300-vllm: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp8 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 128, conc-end: 128 } + - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 4 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 128, conc-end: 128 } + - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 } + qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.9-cu129-amd64 model: Qwen/Qwen3.5-397B-A17B-FP8 diff --git a/benchmarks/single_node/dsv4_fp8_b300.sh b/benchmarks/single_node/dsv4_fp8_b300.sh new file mode 100644 index 000000000..52f38c4d9 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp8_b300.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 +# pareto sweep. The matrix uses dp-attn=true as the existing switch to flip a +# 4-GPU run from TP4 to DP4. Expert parallel is always enabled to match the +# provided vllm serve command exactly. + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + DP_ATTENTION \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# DeepSeek-V4-Pro weights are large; engine startup can exceed the default +# 600s. Give it an hour to load. +export VLLM_ENGINE_READY_TIMEOUT_S=3600 + +PARALLEL_ARGS=(--tensor-parallel-size "$TP" --data-parallel-size 1) +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP") +fi + +BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN" +if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then + BENCHMARK_MAX_MODEL_LEN=4096 +fi + +if [ "${EVAL_ONLY}" = "true" ]; then + EVAL_MAX_MODEL_LEN=$(compute_eval_context_length "$MODEL" "$BENCHMARK_MAX_MODEL_LEN") + export EVAL_MAX_MODEL_LEN + SERVE_MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN" +else + SERVE_MAX_MODEL_LEN="$BENCHMARK_MAX_MODEL_LEN" +fi + +# Start GPU monitoring (power, temperature, clocks every second) +start_gpu_monitor + +set -x +vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ + "${PARALLEL_ARGS[@]}" \ + --pipeline-parallel-size 1 \ + --kv-cache-dtype fp8 \ + --trust-remote-code \ + --block-size 256 \ + --no-enable-prefix-caching \ + --enable-expert-parallel \ + --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \ + --attention_config.use_fp4_indexer_cache True \ + --tokenizer-mode deepseek_v4 \ + --tool-call-parser deepseek_v4 \ + --enable-auto-tool-choice \ + --reasoning-parser deepseek_v4 \ + --max-cudagraph-capture-size 2048 \ + --max-model-len "$SERVE_MAX_MODEL_LEN" \ + --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 & + +SERVER_PID=$! + +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c6ad279d5..9a5bcd245 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1755,7 +1755,7 @@ - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading" - "Configs: 1k1k conc 4-64, 8k1k conc 4-64" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130 - + - config-keys: - dsv4-fp4-b300-sglang description: @@ -1766,3 +1766,14 @@ - "Prefix caching disabled, no speculative decoding" - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 + +- config-keys: + - dsv4-fp8-b300-vllm + description: + - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" + - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" + - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" + - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD From 3eebd566e66e2f1cb12ba6f7c22845e59f735c5a Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Fri, 24 Apr 2026 15:11:20 -0700 Subject: [PATCH 2/5] Update PR link in perf-changelog.yaml --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9a5bcd245..dee58decb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1776,4 +1776,4 @@ - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" - "1k1k uses --max-model-len 4096; 8k1k uses the workflow-provided benchmark context length" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1144 From cdef7c906d4d76ef7bd751ec46d0c5ddad1073c4 Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Fri, 24 Apr 2026 15:14:31 -0700 Subject: [PATCH 3/5] Change precision from fp8 to fp4 in nvidia-master.yaml fix precision --- .github/configs/nvidia-master.yaml | 4 ++-- .../single_node/{dsv4_fp8_b300.sh => dsv4_fp4_b300_vllm.sh} | 0 perf-changelog.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename benchmarks/single_node/{dsv4_fp8_b300.sh => dsv4_fp4_b300_vllm.sh} (100%) mode change 100644 => 100755 diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5335720de..eb16cba8a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2452,12 +2452,12 @@ dsv4-fp8-h200-vllm: # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 # layouts on 4 allocated GPUs. -dsv4-fp8-b300-vllm: +dsv4-fp4-b300-vllm: image: vllm/vllm-openai:deepseekv4-cu129 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 - precision: fp8 + precision: fp4 framework: vllm multinode: false seq-len-configs: diff --git a/benchmarks/single_node/dsv4_fp8_b300.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh old mode 100644 new mode 100755 similarity index 100% rename from benchmarks/single_node/dsv4_fp8_b300.sh rename to benchmarks/single_node/dsv4_fp4_b300_vllm.sh diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dee58decb..c51544602 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1768,7 +1768,7 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143 - config-keys: - - dsv4-fp8-b300-vllm + - dsv4-fp4-b300-vllm description: - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" - "Image: vllm/vllm-openai:deepseekv4-cu129" From 59eef30824b120ad6c34d898b4612c8b96c29c07 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 24 Apr 2026 15:43:36 -0700 Subject: [PATCH 4/5] Bump deepseekv4 vLLM image to cu130 --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index eb16cba8a..cccde0bcc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2453,7 +2453,7 @@ dsv4-fp8-h200-vllm: # field, so dp-attn=true is used as the existing vLLM script switch for DP4 # layouts on 4 allocated GPUs. dsv4-fp4-b300-vllm: - image: vllm/vllm-openai:deepseekv4-cu129 + image: vllm/vllm-openai:deepseekv4-cu130 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c51544602..c2e38104b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1771,7 +1771,7 @@ - dsv4-fp4-b300-vllm description: - "Add DeepSeek-V4-Pro single-node B300 vLLM aggregate benchmark" - - "Image: vllm/vllm-openai:deepseekv4-cu129" + - "Image: vllm/vllm-openai:deepseekv4-cu130" - "Model: deepseek-ai/DeepSeek-V4-Pro" - "Uses the submitted B300 pareto schedule for both 1k1k and 8k1k, excluding conc 1: TP8 at conc 4/128, TP4 at conc 4/8/16/32/64/128, DP4 at conc 256/512" - "Launch args match the provided vllm serve command, including FP4 indexer cache, FULL_AND_PIECEWISE cudagraph config, and max-num-batched-tokens 2048" From 6a0fa73c068a4eef3547f5242f7191f90e066213 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Fri, 24 Apr 2026 15:54:20 -0700 Subject: [PATCH 5/5] B300 launcher: framework suffix for dsv4 scripts Use _${FRAMEWORK} suffix for dsv4 model on B300 so vllm and sglang configs resolve to distinct benchmark scripts. Other models keep the existing _trt-only logic. Rename dsv4_fp4_b300.sh to dsv4_fp4_b300_sglang.sh to match the new convention. --- .../{dsv4_fp4_b300.sh => dsv4_fp4_b300_sglang.sh} | 0 runners/launch_b300-nv.sh | 6 +++++- 2 files changed, 5 insertions(+), 1 deletion(-) rename benchmarks/single_node/{dsv4_fp4_b300.sh => dsv4_fp4_b300_sglang.sh} (100%) diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh similarity index 100% rename from benchmarks/single_node/dsv4_fp4_b300.sh rename to benchmarks/single_node/dsv4_fp4_b300_sglang.sh diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3daac0167..1c4c22445 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -259,7 +259,11 @@ else export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + if [[ "$MODEL_PREFIX" == "dsv4" ]]; then + FRAMEWORK_SUFFIX="_${FRAMEWORK}" + else + FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') + fi SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock"