From 18e19681a72444bfeba9ee1711e32800f8d13e7a Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 29 Apr 2026 11:18:02 +0800
Subject: [PATCH 1/3] dsv4-fp4-b300-sglang: add conc=8192 DP-attention recipe

---
 .github/configs/nvidia-master.yaml             |  2 ++
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7e975fdba..de2fc8ec3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1882,6 +1882,7 @@ dsv4-fp4-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
   - isl: 8192
     osl: 1024
     search-space:
@@ -1890,6 +1891,7 @@ dsv4-fp4-b300-sglang:
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index d50b57d72..e5bc46706 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -31,6 +31,7 @@ export SGLANG_OPT_USE_JIT_NORM=1
 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
 export SGLANG_OPT_USE_TOPK_V2=1
 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+export SGLANG_OPT_USE_ONLINE_COMPRESS=1
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -78,7 +79,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     # ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
     # recipes first (they also have ep=8) so they aren't shadowed by the
     # medium-conc EP_SIZE=8 branch below.
-    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ]; then
+    if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
@@ -91,13 +92,20 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             MEM_FRACTION_STATIC=0.87
             SWA_FULL_TOKENS_RATIO=0.06
             TOKENIZER_WORKER_NUM=4
-        else
+        elif [ "$CONC" = "4096" ]; then
             export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
             CUDA_GRAPH_MAX_BS=544
             MAX_RUNNING_REQUESTS=4352
             MEM_FRACTION_STATIC=0.835
             SWA_FULL_TOKENS_RATIO=0.075
             TOKENIZER_WORKER_NUM=8
+        else
+            export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
+            CUDA_GRAPH_MAX_BS=1088
+            MAX_RUNNING_REQUESTS=8192
+            MEM_FRACTION_STATIC=0.80
+            SWA_FULL_TOKENS_RATIO=0.3
+            TOKENIZER_WORKER_NUM=16
         fi
         PARALLEL_ARGS=(
             --dp-size "$TP"
@@ -109,9 +117,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
             --enable-prefill-delayer
         )
-        if [ "$CONC" = "4096" ]; then
-            PARALLEL_ARGS+=(--decode-log-interval 5)
-        fi
     elif [ "${EP_SIZE}" = "8" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
@@ -169,6 +174,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
+    --stream-interval 30 \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 017a582c24a136f983b8f62e94c9f4461c32ff78 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 29 Apr 2026 13:23:00 +0800
Subject: [PATCH 2/3] dsv4-fp4-b300-sglang: drop conc=8192 from isl=8192 and scope its flags to conc=8192

---
 .github/configs/nvidia-master.yaml             | 1 -
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 9 +++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index de2fc8ec3..f13b8b6dd 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1891,7 +1891,6 @@ dsv4-fp4-b300-sglang:
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index e5bc46706..8f43ea8a3 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -31,7 +31,6 @@ export SGLANG_OPT_USE_JIT_NORM=1
 export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
 export SGLANG_OPT_USE_TOPK_V2=1
 export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
-export SGLANG_OPT_USE_ONLINE_COMPRESS=1
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -100,6 +99,7 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             SWA_FULL_TOKENS_RATIO=0.075
             TOKENIZER_WORKER_NUM=8
         else
+            export SGLANG_OPT_USE_ONLINE_COMPRESS=1
             export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
             CUDA_GRAPH_MAX_BS=1088
             MAX_RUNNING_REQUESTS=8192
@@ -117,6 +117,12 @@ if [ "${DP_ATTENTION}" = "true" ]; then
             --tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
             --enable-prefill-delayer
         )
+        if [ "$CONC" = "4096" ]; then
+            PARALLEL_ARGS+=(--decode-log-interval 5)
+        fi
+        if [ "$CONC" = "8192" ]; then
+            PARALLEL_ARGS+=(--stream-interval 30)
+        fi
     elif [ "${EP_SIZE}" = "8" ]; then
         export NVSHMEM_DISABLE_IB=1
         export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
@@ -174,7 +180,6 @@ PYTHONNOUSERSITE=1 sglang serve \
     --max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
-    --stream-interval 30 \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

From 16e03aea75c0e372c3415f1f99f350eefaab6451 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Wed, 29 Apr 2026 15:28:08 +0800
Subject: [PATCH 3/3] perf-changelog: add dsv4-fp4-b300-sglang conc=8192 entry

---
 perf-changelog.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd14e776..f74dbe770 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1985,3 +1985,10 @@
     - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
     - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16"
+    - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209