From 28b2d58c002495bc1635d26bf6173e3482ed8ae8 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 13:30:39 +0800 Subject: [PATCH 1/7] dsv4-b300-sglang: update points --- .github/configs/nvidia-master.yaml | 2 + .../single_node/dsv4_fp4_b300_sglang.sh | 51 ++++++++++++++----- perf-changelog.yaml | 6 +++ 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 39e299cb0..6fc02f0aa 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1882,6 +1882,8 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8) + - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index ac552c733..c68f38e24 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -71,23 +71,46 @@ MEM_FRACTION_STATIC=0.90 if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 export SGLANG_OPT_USE_FAST_MASK_EP=1 export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 16384 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.94 + if [ "$CONC" = "8192" ]; then + # 1k1k high-concurrency mega_moe deepep recipe + export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8224 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 1056 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 65536 + --tokenizer-worker-num 16 + --enable-prefill-delayer + --decode-log-interval 5 + ) + MAX_RUNNING_REQUESTS=8224 + MEM_FRACTION_STATIC=0.8 + SWA_FULL_TOKENS_RATIO=0.3 + else + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 16384 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.94 + fi else PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 @@ -111,7 +134,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a29c278f2..f96e24b74 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1907,6 +1907,12 @@ - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries" + - config-keys: - dsv4-fp4-b300-sglang-mtp description: From 0343982c7190d1a570b2578e73e7253abeaf2750 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 13:32:33 +0800 Subject: [PATCH 2/7] dsv4-b300-sglang: move changelog entry to end --- perf-changelog.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f96e24b74..620496a08 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1907,12 +1907,6 @@ - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 -- config-keys: - - dsv4-fp4-b300-sglang - description: - - "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16" - - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries" - - config-keys: - dsv4-fp4-b300-sglang-mtp description: @@ -1924,3 +1918,10 @@ - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16" + - "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1207 From 786b2af03dd8a8532850c2b267b86eb2b606b525 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 13:54:43 +0800 Subject: [PATCH 3/7] dsv4-b300-sglang: only test conc 8192 for 1k1k --- .github/configs/nvidia-master.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6fc02f0aa..6624c3220 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1879,9 +1879,10 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # --- only testing conc 8192 for now --- + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8) - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - isl: 8192 From d16796388e51c43abaa1a0ec2d613f7ce2b56c0d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 14:25:12 +0800 Subject: [PATCH 4/7] dsv4-b300-sglang: disable 8k1k configs --- .github/configs/nvidia-master.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6624c3220..6c6c4bdb9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1885,12 +1885,13 @@ dsv4-fp4-b300-sglang: # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8) - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # --- 8k1k temporarily disabled for focused 1k1k testing --- + # - isl: 8192 + # osl: 1024 + # search-space: + # - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + # - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + # - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by From 8bf5fa462774b9f850310f781d01e96f89c3e906 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 15:06:58 +0800 Subject: [PATCH 5/7] dsv4-b300-sglang: set NVSHMEM_DISABLE_IB=1 for conc 8192 --- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index c68f38e24..f926ac732 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -77,6 +77,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 if [ "$CONC" = "8192" ]; then # 1k1k high-concurrency mega_moe deepep recipe + export NVSHMEM_DISABLE_IB=1 export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1 export SGLANG_LOG_FORWARD_ITERS=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 From 92a9d990a2944d68e092af77fd20111b40d70016 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 15:17:35 +0800 Subject: [PATCH 6/7] dsv4-b300-sglang: pin image to deepseek-v4-b300 index sha256:2922230d --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6c6c4bdb9..7580ec4fd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -1901,7 +1901,7 @@ dsv4-fp4-b300-sglang: # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From e88ebb25db5c398491d676334246d545a1d4cdec Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Tue, 28 Apr 2026 15:18:18 +0800 Subject: [PATCH 7/7] dsv4-b300-sglang: revert mtp image to previous digest --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7580ec4fd..19398bccc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1901,7 +1901,7 @@ dsv4-fp4-b300-sglang: # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300