Skip to content
Merged
17 changes: 9 additions & 8 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1893,9 +1893,9 @@ dsv4-fp4-b300-sglang:
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
# DP_ATTENTION:
# dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
# dp-attn: true -> DP-attn + deepep mega_moe + chunked-prefill 32768
# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while the TP-only path leaves ep_size at the default of 1.
# + EAGLE (3,1,4) + mem-fraction 0.90
# dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768
# + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
dsv4-fp4-b300-sglang-mtp:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
model: deepseek-ai/DeepSeek-V4-Pro
Expand All @@ -1904,20 +1904,21 @@ dsv4-fp4-b300-sglang-mtp:
precision: fp4
framework: sglang
multinode: false
# Three CONC bands sweep with EAGLE/MTP (3/1/4) on top:
# A: TP=8 ep=1 -- conc 1-8 (latency-bound, full TP)
# B: TP=4 ep=1 -- conc 16-128 (TP-only, mid batch)
# C: TP=4 ep=4 dp-attn -- conc 64-512 (DP-attn + EP, large batch)
# Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head).
# Three CONC bands:
# A: TP=8 ep=1 -- conc 1-8 EAGLE (3,1,4) TP-only fallback
# B: TP=4 ep=1 -- conc 4-32 EAGLE (3,1,4) TP-only mid batch
#   C: TP=4 ep=1 dp-attn -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer
#      NOTE(review): no dp-attn entry appears in the search-space below, so it
#      is unclear how band C's DP_ATTENTION=true runs get launched — confirm
#      the trigger or add the missing dp-attn search-space entries.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
Expand Down
56 changes: 34 additions & 22 deletions benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ source "$(dirname "$0")/../benchmark_lib.sh"
# TP -- tensor parallel size -> --tp
# EP_SIZE -- expert parallel size -> --ep-size
# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP
# Also selects MoE backend / chunked-prefill-size:
# true -> deepep + mega_moe + chunked-prefill 32768
# false -> flashinfer_mxfp4 + chunked-prefill 8192
#
# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
# Also selects MoE backend / chunked-prefill / EAGLE chain
# / mem-fraction-static / max-running-requests:
# true -> flashinfer_mxfp4 + DP-attn + chunked-prefill 32768
# + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
# false -> flashinfer_mxfp4 (TP-only) + chunked-prefill 8192
#            + EAGLE (3,1,4) + mem-fraction 0.90 + max-running CONC*3/2 (min 8)
check_env_vars \
MODEL \
TP \
Expand Down Expand Up @@ -63,40 +63,52 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
# Recipe path is selected by DP_ATTENTION; MoE backend, chunked-prefill, EAGLE
# chain, mem-fraction, and max-running all follow.
# NOTE(review): DEEPEP_CONFIG looks dead — neither branch passes
# --deepep-config anymore now that the dp-attn path uses flashinfer_mxfp4.
# Confirm nothing else reads it and remove.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
SPEC_FLAGS=(
--speculative-algorithm EAGLE
--speculative-num-steps 3
--speculative-eagle-topk 1
--speculative-num-draft-tokens 4
)

if [ "${DP_ATTENTION}" = "true" ]; then
# Large-batch EP path: deepep + mega_moe.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
# DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256).
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine for now since we are in early stages of v4 support, but eventually we'd like these to just be default settings based on scenario in engine

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
SPEC_FLAGS=(
--speculative-algorithm EAGLE
--speculative-num-steps 1
--speculative-eagle-topk 1
--speculative-num-draft-tokens 2
)
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--cuda-graph-max-bs 256
)
CHUNKED_PREFILL_SIZE=32768
MEM_FRACTION_STATIC=0.92
MAX_RUNNING_REQUESTS=256
else
# Small-batch TP-only path: flashinfer_mxfp4.
# TP-only fallback for low-conc: flashinfer_mxfp4 + EAGLE (3,1,4).
SPEC_FLAGS=(
--speculative-algorithm EAGLE
--speculative-num-steps 3
--speculative-eagle-topk 1
--speculative-num-draft-tokens 4
)
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
)
CHUNKED_PREFILL_SIZE=8192
MEM_FRACTION_STATIC=0.90
MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
Expand All @@ -116,8 +128,8 @@ PYTHONNOUSERSITE=1 sglang serve \
--tp $TP \
--ep-size $EP_SIZE \
--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--mem-fraction-static 0.90 \
--max-running-requests "$MAX_RUNNING_REQUESTS" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio 0.1 \
"${SPEC_FLAGS[@]}" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1907,3 +1907,14 @@
- "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156

- config-keys:
- dsv4-fp4-b300-sglang-mtp
  description:
    - "Retune dsv4-fp4-b300-sglang-mtp: switch the dp-attn recipe from deepep mega_moe to flashinfer_mxfp4 and split EAGLE/MTP tuning per recipe path"
    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (same pinned digest as PR #1158)"
    - "Model: deepseek-ai/DeepSeek-V4-Pro"
    - "EAGLE/MTP flags now per-path in script: TP-only (num-steps=3, eagle-topk=1, num-draft-tokens=4); dp-attn (1, 1, 2)"
    - "dp-attn path: flashinfer_mxfp4 + chunked-prefill 32768 + mem-fraction-static 0.92 + max-running-requests 256 + cuda-graph-max-bs 256"
    - "TP-only path: flashinfer_mxfp4 + chunked-prefill 8192 + mem-fraction-static 0.90 + max-running-requests CONC*3/2 (min 8)"
    - "Three CONC bands: A=TP8 (1-8) EAGLE (3,1,4); B=TP4 (4-32) EAGLE (3,1,4); C=TP4 dp-attn (16-256) EAGLE (1,1,2)"
    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180
Loading