12 changes: 5 additions & 7 deletions .github/configs/nvidia-master.yaml
@@ -2501,17 +2501,15 @@ dsv4-fp4-b300-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 128, conc-end: 128 }
-      - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 }
   - isl: 8192
     osl: 1024
     search-space:
-      - { tp: 8, conc-start: 4, conc-end: 4 }
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, conc-start: 128, conc-end: 128 }
-      - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+      - { tp: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
+      - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 4096 }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
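For orientation, here is a minimal sketch of how one dsv4-fp4-b300-vllm row above might drive the recipe script changed below. TP, EP_SIZE, and DP_ATTENTION are variables the script actually reads; the CONCURRENCY/ISL/OSL names and the doubling sweep from conc-start to conc-end are assumptions about the benchmark harness, not something this diff confirms.

```bash
# Hypothetical expansion of { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
# for the isl=1024/osl=1024 point, assuming the harness doubles concurrency per step.
for CONC in 256 512 1024 2048 4096; do
  TP=4 EP_SIZE=4 DP_ATTENTION=true ISL=1024 OSL=1024 CONCURRENCY="$CONC" \
    bash benchmarks/single_node/dsv4_fp4_b300_vllm.sh
done
```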
20 changes: 15 additions & 5 deletions benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -1,9 +1,8 @@
 #!/usr/bin/env bash
 
 # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
-# pareto sweep. The matrix uses dp-attn=true as the existing switch to flip a
-# 4-GPU run from TP4 to DP4. Expert parallel is always enabled to match the
-# provided vllm serve command exactly.
+# pareto sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
+# (dp-attn=true) enables expert parallel, with EP_SIZE = DP size = the TP value.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -38,6 +37,17 @@ if [ "${DP_ATTENTION}" = "true" ]; then
   PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
 fi
 
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+  EP_ARGS=(--enable-expert-parallel)
+fi
+
+if [ "${DP_ATTENTION}" = "true" ]; then
+  MAX_NUM_BATCHED_TOKENS=2048
+else
+  MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
+fi
+
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
   BENCHMARK_MAX_MODEL_LEN=4096
@@ -62,7 +72,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
   --trust-remote-code \
   --block-size 256 \
   --no-enable-prefix-caching \
-  --enable-expert-parallel \
+  "${EP_ARGS[@]}" \
   --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
   --attention_config.use_fp4_indexer_cache True \
   --tokenizer-mode deepseek_v4 \
@@ -71,7 +81,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
   --reasoning-parser deepseek_v4 \
   --max-cudagraph-capture-size 2048 \
   --max-model-len "$SERVE_MAX_MODEL_LEN" \
-  --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 &
+  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 
9 changes: 9 additions & 0 deletions perf-changelog.yaml
@@ -1894,3 +1894,12 @@
- "better performance for dp-attention"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178

- config-keys:
- dsv4-fp4-b300-vllm
description:
- "Update search space based on B300 pareto sweep results"
- "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
- "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
