From f2dda86f26db44f887abf695baf39e99707f0958 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 00:54:12 +0800
Subject: [PATCH 1/8] dsv4-b300-sglang-mtp: restore TP4 and DP-attn
 search-space entries

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3a7ba3df1..673540f14 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1892,10 +1892,14 @@ dsv4-fp4-b300-sglang-mtp:
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e

From 4190a9495fed5ba8b36b60981a19595965994326 Mon Sep 17 00:00:00 2001
From: Yuhao Yang <47235274+yhyang201@users.noreply.github.com>
Date: Mon, 27 Apr 2026 01:34:26 +0800
Subject: [PATCH 2/8] dsv4-b300-sglang-mtp: drop DP-attn search-space entries

---
 .github/configs/nvidia-master.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 673540f14..8edd6353e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1893,13 +1893,11 @@ dsv4-fp4-b300-sglang-mtp:
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e

From dbde9fa4ddde4bcc6ef856d20233f9c1b88f6021 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Mon, 27 Apr 2026 01:39:29 +0800
Subject: [PATCH 3/8] dsv4-b300-sglang-mtp: add perf-changelog entry for PR
 #1180

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e4c46268e..fc951283e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1894,3 +1894,15 @@
     - "better performance for dp-attention"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "EAGLE/MTP flags hardcoded in script per recipe: TP-only uses num-steps=3, eagle-topk=1, num-draft-tokens=4; DP-attn uses num-steps=1, eagle-topk=1, num-draft-tokens=2"
+    - "Recipe (chunked-prefill, EAGLE chain, mem-fraction, max-running-requests) selected in script by dp-attn; both paths use flashinfer_mxfp4: TP-only (chunked-prefill 8192) vs DP-attn (chunked-prefill 32768)"
+    - "Two CONC bands in the sweep, both TP-only: A=TP8 (1-8), B=TP4 (4-32); A/B overlap at conc 4-8; the DP-attn recipe stays in the script but has no search-space entry"
+    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180

From b8a625d83e7fd491a1be23bc744b4c3c0a6c6d34 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sun, 26 Apr 2026 22:16:51 -0700
Subject: [PATCH 4/8] add dp-attn band; flashinfer + (1,1,2) chain

---
 .github/configs/nvidia-master.yaml            | 18 +++---
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 57 ++++++++++++-------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a15abfcd8..b523c6d2c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1872,9 +1872,9 @@ dsv4-fp4-b300-sglang:
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
 # DP_ATTENTION:
 #   dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
-#   dp-attn: true  -> DP-attn + deepep mega_moe + chunked-prefill 32768
-# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
-# while the TP-only path leaves ep_size at the default of 1.
+#                     + EAGLE (3,1,4) + mem-fraction 0.90
+#   dp-attn: true  -> DP-attn  + flashinfer_mxfp4 + chunked-prefill 32768
+#                     + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
 dsv4-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1883,22 +1883,24 @@ dsv4-fp4-b300-sglang-mtp:
   precision: fp4
   framework: sglang
   multinode: false
-  # Three CONC bands sweep with EAGLE/MTP (3/1/4) on top:
-  #   A: TP=8 ep=1            -- conc 1-8    (latency-bound, full TP)
-  #   B: TP=4 ep=1            -- conc 16-128 (TP-only, mid batch)
-  #   C: TP=4 ep=4 dp-attn    -- conc 64-512 (DP-attn + EP, large batch)
-  # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head).
+  # Three CONC bands:
+  #   A: TP=8 ep=1            -- conc 1-8    EAGLE (3,1,4) TP-only fallback
+  #   B: TP=4 ep=1            -- conc 16-128 EAGLE (3,1,4) TP-only mid batch
+  #   C: TP=4 ep=1 dp-attn    -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer
+  # Overlap: B/C at conc 16-128 (TP-only vs DP-attn head-to-head).
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index d01f80a1d..b5f920c6c 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -6,12 +6,12 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 #   TP            -- tensor parallel size                       -> --tp
 #   EP_SIZE       -- expert parallel size                       -> --ep-size
 #   DP_ATTENTION  -- "true" enables --enable-dp-attention --dp-size $TP
-#                    Also selects MoE backend / chunked-prefill-size:
-#                      true  -> deepep + mega_moe + chunked-prefill 32768
-#                      false -> flashinfer_mxfp4  + chunked-prefill 8192
-#
-# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
-# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
+#                    Also selects MoE backend / chunked-prefill / EAGLE chain
+#                    / mem-fraction-static / max-running-requests:
+#                      true  -> flashinfer_mxfp4 + DP-attn + chunked-prefill 32768
+#                               + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
+#                      false -> flashinfer_mxfp4 (TP-only) + chunked-prefill 8192
+#                               + EAGLE (3,1,4) + mem-fraction 0.90 + max-running CONC*3/2
 check_env_vars \
     MODEL \
     TP \
@@ -63,40 +63,53 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
+# Recipe path is selected by DP_ATTENTION; MoE backend, chunked-prefill, EAGLE
+# chain, mem-fraction, and max-running all follow.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
-SPEC_FLAGS=(
-    --speculative-algorithm EAGLE
-    --speculative-num-steps 3
-    --speculative-eagle-topk 1
-    --speculative-num-draft-tokens 4
-)
-
 if [ "${DP_ATTENTION}" = "true" ]; then
-    # Large-batch EP path: deepep + mega_moe.
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256).
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
     export SGLANG_OPT_USE_FAST_MASK_EP=1
     export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
     export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
     export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
     export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    SPEC_FLAGS=(
+        --speculative-algorithm EAGLE
+        --speculative-num-steps 1
+        --speculative-eagle-topk 1
+        --speculative-num-draft-tokens 2
+    )
     PARALLEL_ARGS=(
         --dp-size "$TP"
         --enable-dp-attention
-        --moe-a2a-backend deepep
+        --moe-runner-backend flashinfer_mxfp4
+        --disable-flashinfer-autotune
         --deepep-config "$DEEPEP_CONFIG"
+        --cuda-graph-max-bs 256
+        --schedule-conservativeness 2
     )
     CHUNKED_PREFILL_SIZE=32768
+    MEM_FRACTION_STATIC=0.92
+    MAX_RUNNING_REQUESTS=256
 else
-    # Small-batch TP-only path: flashinfer_mxfp4.
+    # TP-only fallback for low-conc: flashinfer_mxfp4 + EAGLE (3,1,4).
+    SPEC_FLAGS=(
+        --speculative-algorithm EAGLE
+        --speculative-num-steps 3
+        --speculative-eagle-topk 1
+        --speculative-num-draft-tokens 4
+    )
     PARALLEL_ARGS=(
         --moe-runner-backend flashinfer_mxfp4
         --disable-flashinfer-autotune
     )
     CHUNKED_PREFILL_SIZE=8192
+    MEM_FRACTION_STATIC=0.90
+    MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
 fi
 
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
@@ -116,8 +129,8 @@ PYTHONNOUSERSITE=1 sglang serve \
     --tp $TP \
     --ep-size $EP_SIZE \
     --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
-    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
-    --mem-fraction-static 0.90 \
+    --max-running-requests "$MAX_RUNNING_REQUESTS" \
+    --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio 0.1 \
     "${SPEC_FLAGS[@]}" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &

From 01d7a9fc78b6a68f07cbdb00a2f0a3da69a6b72d Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sun, 26 Apr 2026 22:18:21 -0700
Subject: [PATCH 5/8] drop tp4 ep1 tp-only band

---
 .github/configs/nvidia-master.yaml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b523c6d2c..d9eae3e6b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1883,23 +1883,19 @@ dsv4-fp4-b300-sglang-mtp:
   precision: fp4
   framework: sglang
   multinode: false
-  # Three CONC bands:
+  # Two CONC bands:
   #   A: TP=8 ep=1            -- conc 1-8    EAGLE (3,1,4) TP-only fallback
-  #   B: TP=4 ep=1            -- conc 16-128 EAGLE (3,1,4) TP-only mid batch
   #   C: TP=4 ep=1 dp-attn    -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer
-  # Overlap: B/C at conc 16-128 (TP-only vs DP-attn head-to-head).
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
     - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
     - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:

From 74d5b694b3d806cc88e7dfc4c362cbc9b650d2d0 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sun, 26 Apr 2026 22:19:50 -0700
Subject: [PATCH 6/8] restore tp4 ep1 band; conc 4-32

---
 .github/configs/nvidia-master.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d9eae3e6b..1048bf67d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1883,19 +1883,22 @@ dsv4-fp4-b300-sglang-mtp:
   precision: fp4
   framework: sglang
   multinode: false
-  # Two CONC bands:
+  # Three CONC bands:
   #   A: TP=8 ep=1            -- conc 1-8    EAGLE (3,1,4) TP-only fallback
+  #   B: TP=4 ep=1            -- conc 4-32   EAGLE (3,1,4) TP-only mid batch
   #   C: TP=4 ep=1 dp-attn    -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
     - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
     - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:

From f66a2df1550de942cade64ef32e793149e79adab Mon Sep 17 00:00:00 2001
From: Qiaolin-Yu <liin1211@outlook.com>
Date: Mon, 27 Apr 2026 02:34:10 -0700
Subject: [PATCH 7/8] dsv4-b300-sglang-mtp: remove DP-attn search-space entries

---
 .github/configs/nvidia-master.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1048bf67d..cb3d6d727 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1893,13 +1893,11 @@ dsv4-fp4-b300-sglang-mtp:
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-    - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-    - { tp: 4, ep: 1, dp-attn: true, conc-start: 16, conc-end: 256, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e

From e45c425b976febb5bc910b54fdbec56ab4cc6ac3 Mon Sep 17 00:00:00 2001
From: Qiaolin-Yu <liin1211@outlook.com>
Date: Mon, 27 Apr 2026 12:58:12 -0700
Subject: [PATCH 8/8] dsv4-b300-sglang-mtp: drop --schedule-conservativeness 2

---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index b5f920c6c..03102778d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -90,7 +90,6 @@ if [ "${DP_ATTENTION}" = "true" ]; then
         --disable-flashinfer-autotune
         --deepep-config "$DEEPEP_CONFIG"
         --cuda-graph-max-bs 256
-        --schedule-conservativeness 2
     )
     CHUNKED_PREFILL_SIZE=32768
     MEM_FRACTION_STATIC=0.92