Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,7 @@ dsv4-fp4-b300-sglang:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
- isl: 8192
osl: 1024
search-space:
Expand Down
15 changes: 13 additions & 2 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
# recipes first (they also have ep=8) so they aren't shadowed by the
# medium-conc EP_SIZE=8 branch below.
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
Expand All @@ -91,13 +91,21 @@ if [ "${DP_ATTENTION}" = "true" ]; then
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
TOKENIZER_WORKER_NUM=4
else
elif [ "$CONC" = "4096" ]; then
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
CUDA_GRAPH_MAX_BS=544
MAX_RUNNING_REQUESTS=4352
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
TOKENIZER_WORKER_NUM=8
else
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
CUDA_GRAPH_MAX_BS=1088
MAX_RUNNING_REQUESTS=8192
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
TOKENIZER_WORKER_NUM=16
fi
PARALLEL_ARGS=(
--dp-size "$TP"
Expand All @@ -112,6 +120,9 @@ if [ "${DP_ATTENTION}" = "true" ]; then
if [ "$CONC" = "4096" ]; then
PARALLEL_ARGS+=(--decode-log-interval 5)
fi
if [ "$CONC" = "8192" ]; then
PARALLEL_ARGS+=(--stream-interval 30)
fi
elif [ "${EP_SIZE}" = "8" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1985,3 +1985,10 @@
- "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
- "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16"
- "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209
Loading