From 18e19681a72444bfeba9ee1711e32800f8d13e7a Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 11:18:02 +0800 Subject: [PATCH 1/3] sglang-update --- .github/configs/nvidia-master.yaml | 2 ++ benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 16 +++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7e975fdba..de2fc8ec3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1882,6 +1882,7 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: @@ -1890,6 +1891,7 @@ dsv4-fp4-b300-sglang: - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index d50b57d72..e5bc46706 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -31,6 +31,7 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +export SGLANG_OPT_USE_ONLINE_COMPRESS=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -78,7 +79,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc # recipes first (they also have ep=8) so they aren't shadowed by the # medium-conc EP_SIZE=8 branch below. - if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then + if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 @@ -91,13 +92,20 @@ if [ "${DP_ATTENTION}" = "true" ]; then MEM_FRACTION_STATIC=0.87 SWA_FULL_TOKENS_RATIO=0.06 TOKENIZER_WORKER_NUM=4 - else + elif [ "$CONC" = "4096" ]; then export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320 CUDA_GRAPH_MAX_BS=544 MAX_RUNNING_REQUESTS=4352 MEM_FRACTION_STATIC=0.835 SWA_FULL_TOKENS_RATIO=0.075 TOKENIZER_WORKER_NUM=8 + else + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 + CUDA_GRAPH_MAX_BS=1088 + MAX_RUNNING_REQUESTS=8192 + MEM_FRACTION_STATIC=0.80 + SWA_FULL_TOKENS_RATIO=0.3 + TOKENIZER_WORKER_NUM=16 fi PARALLEL_ARGS=( --dp-size "$TP" @@ -109,9 +117,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" --enable-prefill-delayer ) - if [ "$CONC" = "4096" ]; then - PARALLEL_ARGS+=(--decode-log-interval 5) - fi elif [ "${EP_SIZE}" = "8" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 @@ -169,6 +174,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ + --stream-interval 30 \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! From 017a582c24a136f983b8f62e94c9f4461c32ff78 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 13:23:00 +0800 Subject: [PATCH 2/3] sglang-update --- .github/configs/nvidia-master.yaml | 1 - benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 9 +++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index de2fc8ec3..f13b8b6dd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1891,7 +1891,6 @@ dsv4-fp4-b300-sglang: - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index e5bc46706..8f43ea8a3 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -31,7 +31,6 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 -export SGLANG_OPT_USE_ONLINE_COMPRESS=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -100,6 +99,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then SWA_FULL_TOKENS_RATIO=0.075 TOKENIZER_WORKER_NUM=8 else + export SGLANG_OPT_USE_ONLINE_COMPRESS=1 export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256 CUDA_GRAPH_MAX_BS=1088 MAX_RUNNING_REQUESTS=8192 @@ -117,6 +117,12 @@ if [ "${DP_ATTENTION}" = "true" ]; then --tokenizer-worker-num "$TOKENIZER_WORKER_NUM" --enable-prefill-delayer ) + if [ "$CONC" = "4096" ]; then + PARALLEL_ARGS+=(--decode-log-interval 5) + fi + if [ "$CONC" = "8192" ]; then + PARALLEL_ARGS+=(--stream-interval 30) + fi elif [ "${EP_SIZE}" = "8" ]; then export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 @@ -174,7 +180,6 @@ PYTHONNOUSERSITE=1 sglang serve \ --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ - --stream-interval 30 \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! From 16e03aea75c0e372c3415f1f99f350eefaab6451 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 29 Apr 2026 15:28:08 +0800 Subject: [PATCH 3/3] sglang-update --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd14e776..f74dbe770 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1985,3 +1985,10 @@ - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" + - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209