Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand All @@ -1879,15 +1879,19 @@ dsv4-fp4-b300-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
# --- only testing conc 8192 for now ---
# - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
# - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
# - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
# ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8)
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
# --- 8k1k temporarily disabled for focused 1k1k testing ---
# - isl: 8192
# osl: 1024
# search-space:
# - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
# - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
# - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
52 changes: 38 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,47 @@ MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
# Select the serving recipe for the DP-attention path based on target
# concurrency. NOTE(review): this is a string comparison, so it only fires
# when CONC is exactly the literal "8192" (e.g. "08192" would not match) —
# confirm that is intended if CONC can ever be normalized differently.
if [ "$CONC" = "8192" ]; then
# 1k1k high-concurrency recipe: mega_moe over the deepep all-to-all backend.
# NVSHMEM_DISABLE_IB=1 presumably forces intra-node transport only — this is
# a single-node benchmark script; verify against the NVSHMEM docs.
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
# Mega-MoE enabled (=1) in this branch; the else-branch below disables it.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
# 8224 matches MAX_RUNNING_REQUESTS below — keep the two in sync.
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8224
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
# deepep a2a backend replaces the flashinfer_mxfp4 runner used in the
# else-branch; with ep=8 the YAML entry maps onto this path.
--moe-a2a-backend deepep
--cuda-graph-max-bs 1056
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 65536
--tokenizer-worker-num 16
--enable-prefill-delayer
--decode-log-interval 5
)
# Overrides the CONC*3/2 default used by the serve invocation
# (see the "${MAX_RUNNING_REQUESTS:-...}" fallback on the
# --max-running-requests line later in this script).
MAX_RUNNING_REQUESTS=8224
# Lower static memory fraction (0.8 vs 0.94 below) — presumably leaves
# headroom for the larger KV/dispatch buffers at conc=8192; confirm.
MEM_FRACTION_STATIC=0.8
SWA_FULL_TOKENS_RATIO=0.3
else
# Default DP-attention recipe (conc != 8192): mega_moe disabled,
# flashinfer_mxfp4 MoE runner.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
# MAX_RUNNING_REQUESTS is deliberately left unset here so the serve
# command falls back to its computed CONC-based default.
MEM_FRACTION_STATIC=0.94
fi
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
Expand All @@ -111,7 +135,7 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@
- "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
- "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16"
- "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1207