diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml index 1f1649d29..8fab04531 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml @@ -87,6 +87,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -104,6 +105,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml index d1f6aa2bf..e1f93b63d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml @@ -89,6 +89,7 @@ backend: SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" @@ -118,6 +119,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml index 4d696ae35..80fb7534c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml @@ -89,6 +89,7 @@ backend: SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" @@ -118,6 +119,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml index 72b8babf5..395d7f4d6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml @@ -89,6 +89,7 @@ backend: SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" @@ -118,6 +119,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml index 526aa8636..5f164b7b4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml @@ -89,6 +89,7 @@ backend: SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" @@ -118,6 +119,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml index 71cfa4bc3..fe24fa97f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml @@ -89,6 +89,7 @@ backend: SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" @@ -118,6 +119,7 @@ backend: SGLANG_OPT_USE_JIT_NORM: "1" SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_MULTI_STREAM_OVERLAP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_USE_FAST_MASK_EP: "1" diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index df1259deb..6c391e949 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -27,6 +27,7 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=1 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 8f43ea8a3..1e0c24685 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -31,6 +31,7 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 03102778d..d0244e0ad 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -42,6 +42,7 @@ export SGLANG_OPT_USE_JIT_NORM=1 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 export SGLANG_OPT_USE_TOPK_V2=1 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 +export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2dfcda9fe..d651ecd0a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2063,3 +2063,14 @@ - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242 + +- config-keys: + - dsv4-fp4-b200-sglang + - dsv4-fp4-b300-sglang + - dsv4-fp4-b300-sglang-mtp + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Add SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=1 to all SGLang DeepSeek-V4 launch configurations" + - "Single-node: dsv4_fp4_b200.sh, dsv4_fp4_b300_sglang.sh, dsv4_fp4_b300_sglang_mtp.sh" + - "Multi-node: conc1, conc512, conc512-20, conc1024, conc2048, conc16384 (both prefill and decode environments)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1246