Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,7 @@ dsr1-fp4-b200-sglang:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }

dsv4-fp4-b200-sglang:
image: lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b
image: lmsysorg/sglang:deepseek-v4-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
Expand All @@ -1690,7 +1690,7 @@ dsv4-fp4-b200-sglang:
osl: 1024
search-space:
# low-latency
- { tp: 8, ep: 1, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
Expand All @@ -1699,7 +1699,7 @@ dsv4-fp4-b200-sglang:
osl: 1024
search-space:
# low-latency
- { tp: 8, ep: 1, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
Expand Down
71 changes: 34 additions & 37 deletions benchmarks/single_node/dsv4_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand All @@ -20,13 +19,7 @@ hf download "$MODEL"

nvidia-smi

# Common SGLANG env vars (apply to every config).
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
Expand All @@ -37,7 +30,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -47,41 +40,47 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
# script's pattern). DP-attention turns on EP-MoE (deepep) and the related
# mega_moe optimizations; single-instance uses flashinfer_mxfp4.
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune
# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128
# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
if [[ $CONC -le 32 ]]; then
RECIPE=low-latency
RECIPE_FLAGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 4096
--disable-flashinfer-autotune
--mem-fraction-static 0.82
)
elif [[ $CONC -le 128 ]]; then
RECIPE=balanced
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
RECIPE_FLAGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 32768
--mem-fraction-static 0.82
--cuda-graph-max-bs 64
--max-running-requests 128
)
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
RECIPE=max-throughput
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
RECIPE_FLAGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--deepep-config "$DEEPEP_CONFIG"
--mem-fraction-static 0.82
--cuda-graph-max-bs 64
--max-running-requests 256
)
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
# launch config is auditable from the result artifact alone.
{
echo "=== SGLANG_* env vars at launch ==="
env | grep -E '^SGLANG_' | sort
echo "==================================="
} | tee "$SERVER_LOG"
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
PYTHONNOUSERSITE=1 sglang serve \
Expand All @@ -90,10 +89,8 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$((CONC * 3 / 2))" \
--mem-fraction-static 0.90 \
--swa-full-tokens-ratio 0.1 \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
--disable-radix-cache \
"${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
Loading