Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1860,7 +1860,7 @@ dsr1-fp8-b300-sglang:
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2922230d92982cec72f4ead04fb1da2af5301bef48f223a822fa4cf9696b9fcd
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
Expand All @@ -1879,15 +1879,19 @@ dsv4-fp4-b300-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
# --- only testing conc 8192 for now ---
# - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
# - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
# - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
# ep=8 is a naming convention for mega_moe deepep backend (actual ep=tp=8)
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
# --- 8k1k temporarily disabled for focused 1k1k testing ---
# - isl: 8192
# osl: 1024
# search-space:
# - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
# - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
# - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
Expand Down
52 changes: 38 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,23 +71,47 @@ MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
# Select the serving recipe for the DP-attention path based on target
# concurrency. NOTE(review): this is a string comparison, so it only fires
# when CONC is exactly the literal "8192" (e.g. "08192" would not match) —
# confirm that is intended if CONC can ever be normalized differently.
if [ "$CONC" = "8192" ]; then
# 1k1k high-concurrency recipe: mega_moe over the deepep all-to-all backend.
# NVSHMEM_DISABLE_IB=1 presumably forces intra-node transport only — this is
# a single-node benchmark script; verify against the NVSHMEM docs.
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
# Mega-MoE enabled (=1) in this branch; the else-branch below disables it.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
# 8224 matches MAX_RUNNING_REQUESTS below — keep the two in sync.
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8224
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
# deepep a2a backend replaces the flashinfer_mxfp4 runner used in the
# else-branch; with ep=8 the YAML entry maps onto this path.
--moe-a2a-backend deepep
--cuda-graph-max-bs 1056
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 65536
--tokenizer-worker-num 16
--enable-prefill-delayer
--decode-log-interval 5
)
# Overrides the CONC*3/2 default used by the serve invocation
# (see the "${MAX_RUNNING_REQUESTS:-...}" fallback on the
# --max-running-requests line later in this script).
MAX_RUNNING_REQUESTS=8224
# Lower static memory fraction (0.8 vs 0.94 below) — presumably leaves
# headroom for the larger KV/dispatch buffers at conc=8192; confirm.
MEM_FRACTION_STATIC=0.8
SWA_FULL_TOKENS_RATIO=0.3
else
# Default DP-attention recipe (conc != 8192): mega_moe disabled,
# flashinfer_mxfp4 MoE runner.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
# MAX_RUNNING_REQUESTS is deliberately left unset here so the serve
# command falls back to its computed CONC-based default.
MEM_FRACTION_STATIC=0.94
fi
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
Expand All @@ -111,7 +135,7 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--max-running-requests "${MAX_RUNNING_REQUESTS:-$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))}" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@
- "Search space: TP=8, concurrency 4-64, 1k1k and 8k1k"
- "MI355X runner updated to resolve framework-specific script names (dsv4_fp8_mi355x_vllm.sh) with fallback to generic names"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1188

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "1k1k conc=8192: mega_moe deepep backend with cuda-graph-max-bs 1056, max-running-requests 8224, mem 0.8, swa-ratio 0.3, tokenizer-workers 16"
- "ep=8 naming convention in yaml distinguishes mega_moe from existing flashinfer_mxfp4 entries"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1207