From 148223d4af01567af9ff7af893fcdba8cc1d6f14 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 20:50:05 -0700
Subject: [PATCH 01/17] sglang dsv4 mtp

---
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
new file mode 100755
index 000000000..4383c408f
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# with EAGLE / MTP enabled:
+#   - low-latency    (CONC <= 32):        TP-only, flashinfer_mxfp4 MoE
+#   - balanced       (32 < CONC <= 128):  + DP-attn, mega-moe EP
+#   - max-throughput (CONC > 128):        + DP-attn, mega-moe EP, max-running-requests=512
+# Speculative-decoding flags follow the cookbook EAGLE config.
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+# MTP (EAGLE) speculative-decoding flags applied to every recipe.
+SPEC_FLAGS=(
+    --speculative-algorithm EAGLE
+    --speculative-num-steps 3
+    --speculative-eagle-topk 1
+    --speculative-num-draft-tokens 4
+)
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+    # common optimizations
+    export SGLANG_OPT_USE_JIT_NORM=1
+    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+    export SGLANG_OPT_USE_TOPK_V2=1
+    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 32768
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.90
+        --max-running-requests 32
+        --swa-full-tokens-ratio 0.1
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+    # common optimizations
+    export SGLANG_OPT_USE_JIT_NORM=1
+    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+    export SGLANG_OPT_USE_TOPK_V2=1
+    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+    # MoE EP related flags
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_USE_FAST_MASK_EP=1
+    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.83
+        --max-running-requests 128
+        --chunked-prefill-size 32768
+        --swa-full-tokens-ratio 0.1
+    )
+else
+    RECIPE=max-throughput
+    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+    # common optimizations
+    export SGLANG_OPT_USE_JIT_NORM=1
+    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+    export SGLANG_OPT_USE_TOPK_V2=1
+    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+    # MoE EP related flags
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_USE_FAST_MASK_EP=1
+    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.90
+        --max-running-requests 512
+        --chunked-prefill-size 32768
+        --swa-full-tokens-ratio 0.1
+    )
+fi
+echo "Recipe: $RECIPE (CONC=$CONC)"
+
+set -x
+PYTHONNOUSERSITE=1 sglang serve \
+    --model-path $MODEL \
+    --host 0.0.0.0 \
+    --port $PORT \
+    --trust-remote-code \
+    --tp $TP \
+    "${SPEC_FLAGS[@]}" \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --use-chat-template
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x

From c883e8dd66585f4ac6c8ef21357ef2e610fbcc37 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 21:02:14 -0700
Subject: [PATCH 02/17] knob-driven recipe selection

---
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 144 ++++++++----------
 1 file changed, 67 insertions(+), 77 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 4383c408f..0ac9d017d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -2,9 +2,25 @@
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
+# Tuning knobs (matrix-driven, all required - no script-side defaults):
+#   TP                   -- tensor parallel size                       -> --tp
+#   EP_SIZE              -- expert parallel size                       -> --ep-size
+#   DP_ATTENTION         -- "true" enables --enable-dp-attention --dp-size $TP
+#   MOE_RUNNER_BACKEND   -- recipe label, one of: deepep | flashinfer_mxfp4
+#                            deepep           -> --moe-a2a-backend deepep + mega_moe env vars
+#                            flashinfer_mxfp4 -> --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune
+#   CHUNKED_PREFILL_SIZE -- --chunked-prefill-size value (e.g. 8192, 32768)
+#
+# MTP/EAGLE speculative-decoding flags are applied unconditionally on top of
+# every recipe (same draft chain across CONC ranges). Tuning the spec config
+# per recipe is left as future work once we have sweep data.
 check_env_vars \
     MODEL \
     TP \
+    EP_SIZE \
+    DP_ATTENTION \
+    MOE_RUNNER_BACKEND \
+    CHUNKED_PREFILL_SIZE \
     CONC \
     ISL \
     OSL \
@@ -23,7 +39,13 @@ fi
 
 nvidia-smi
 
+# Common SGLANG env vars (apply to every config).
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+export SGLANG_OPT_USE_JIT_NORM=1
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+export SGLANG_OPT_USE_TOPK_V2=1
+export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -35,7 +57,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, MOE_RUNNER_BACKEND: $MOE_RUNNER_BACKEND, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -45,12 +67,7 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# with EAGLE / MTP enabled:
-#   - low-latency    (CONC <= 32):        TP-only, flashinfer_mxfp4 MoE
-#   - balanced       (32 < CONC <= 128):  + DP-attn, mega-moe EP
-#   - max-throughput (CONC > 128):        + DP-attn, mega-moe EP, max-running-requests=512
-# Speculative-decoding flags follow the cookbook EAGLE config.
+# Recipe path is selected by MOE_RUNNER_BACKEND. DP-attention applies orthogonally below.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
 # MTP (EAGLE) speculative-decoding flags applied to every recipe.
@@ -61,76 +78,44 @@ SPEC_FLAGS=(
     --speculative-num-draft-tokens 4
 )
 
-if [[ $CONC -le 32 ]]; then
-    RECIPE=low-latency
-    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-    # common optimizations
-    export SGLANG_OPT_USE_JIT_NORM=1
-    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-    export SGLANG_OPT_USE_TOPK_V2=1
-    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
-    RECIPE_FLAGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 32768
-        --disable-flashinfer-autotune
-        --mem-fraction-static 0.90
-        --max-running-requests 32
-        --swa-full-tokens-ratio 0.1
-    )
-elif [[ $CONC -le 128 ]]; then
-    RECIPE=balanced
-    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-    # common optimizations
-    export SGLANG_OPT_USE_JIT_NORM=1
-    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-    export SGLANG_OPT_USE_TOPK_V2=1
-    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
-    # MoE EP related flags
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.83
-        --max-running-requests 128
-        --chunked-prefill-size 32768
-        --swa-full-tokens-ratio 0.1
-    )
-else
-    RECIPE=max-throughput
-    export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-    # common optimizations
-    export SGLANG_OPT_USE_JIT_NORM=1
-    export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-    export SGLANG_OPT_USE_TOPK_V2=1
-    export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
-    # MoE EP related flags
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.90
-        --max-running-requests 512
-        --chunked-prefill-size 32768
-        --swa-full-tokens-ratio 0.1
-    )
+case "${MOE_RUNNER_BACKEND}" in
+    deepep)
+        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+        export SGLANG_OPT_USE_FAST_MASK_EP=1
+        export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+        export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+        export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+        PARALLEL_ARGS=(
+            --moe-a2a-backend deepep
+            --deepep-config "$DEEPEP_CONFIG"
+        )
+        ;;
+    flashinfer_mxfp4)
+        PARALLEL_ARGS=(
+            --moe-runner-backend flashinfer_mxfp4
+            --disable-flashinfer-autotune
+        )
+        ;;
+    *)
+        echo "ERROR: unknown MOE_RUNNER_BACKEND='${MOE_RUNNER_BACKEND}' (expected: deepep | flashinfer_mxfp4)" >&2
+        exit 1
+        ;;
+esac
+
+# DP-attention is orthogonal to MOE_RUNNER_BACKEND.
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
 fi
-echo "Recipe: $RECIPE (CONC=$CONC)"
+
+# Print all SGLANG_* env vars to both the CI step log and server.log so the
+# launch config is auditable from the result artifact alone.
+{
+    echo "=== SGLANG_* env vars at launch ==="
+    env | grep -E '^SGLANG_' | sort
+    echo "==================================="
+} | tee "$SERVER_LOG"
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
@@ -139,8 +124,13 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
+    --ep-size $EP_SIZE \
+    --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
+    --max-running-requests "$((CONC * 3 / 2))" \
+    --mem-fraction-static 0.90 \
+    --swa-full-tokens-ratio 0.1 \
     "${SPEC_FLAGS[@]}" \
-    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From 3a49ed12d0aa44c79141d8ba573390230f7275b2 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 21:09:48 -0700
Subject: [PATCH 03/17] self-contained mtp config; recipe via dp-attn

---
 .github/configs/nvidia-master.yaml            | 29 ++++++++
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 69 ++++++++-----------
 perf-changelog.yaml                           | 11 +++
 3 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..0351ab754 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1873,6 +1873,35 @@ dsv4-fp4-b300-sglang:
     # max-throughput
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
+# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
+# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
+# DP_ATTENTION:
+#   dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
+#   dp-attn: true  -> DP-attn + deepep mega_moe + chunked-prefill 32768
+# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+# while the TP-only path leaves ep_size at the default of 1.
+dsv4-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
+
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 0ac9d017d..deac9b9ca 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -3,13 +3,12 @@
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 # Tuning knobs (matrix-driven, all required - no script-side defaults):
-#   TP                   -- tensor parallel size                       -> --tp
-#   EP_SIZE              -- expert parallel size                       -> --ep-size
-#   DP_ATTENTION         -- "true" enables --enable-dp-attention --dp-size $TP
-#   MOE_RUNNER_BACKEND   -- recipe label, one of: deepep | flashinfer_mxfp4
-#                            deepep           -> --moe-a2a-backend deepep + mega_moe env vars
-#                            flashinfer_mxfp4 -> --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune
-#   CHUNKED_PREFILL_SIZE -- --chunked-prefill-size value (e.g. 8192, 32768)
+#   TP            -- tensor parallel size                       -> --tp
+#   EP_SIZE       -- expert parallel size                       -> --ep-size
+#   DP_ATTENTION  -- "true" enables --enable-dp-attention --dp-size $TP
+#                    Also selects MoE backend / chunked-prefill-size:
+#                      true  -> deepep + mega_moe + chunked-prefill 32768
+#                      false -> flashinfer_mxfp4  + chunked-prefill 8192
 #
 # MTP/EAGLE speculative-decoding flags are applied unconditionally on top of
 # every recipe (same draft chain across CONC ranges). Tuning the spec config
@@ -19,8 +18,6 @@ check_env_vars \
     TP \
     EP_SIZE \
     DP_ATTENTION \
-    MOE_RUNNER_BACKEND \
-    CHUNKED_PREFILL_SIZE \
     CONC \
     ISL \
     OSL \
@@ -57,7 +54,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, MOE_RUNNER_BACKEND: $MOE_RUNNER_BACKEND, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -67,7 +64,7 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# Recipe path is selected by MOE_RUNNER_BACKEND. DP-attention applies orthogonally below.
+# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
 # MTP (EAGLE) speculative-decoding flags applied to every recipe.
@@ -78,35 +75,29 @@ SPEC_FLAGS=(
     --speculative-num-draft-tokens 4
 )
 
-case "${MOE_RUNNER_BACKEND}" in
-    deepep)
-        export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-        export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-        export SGLANG_OPT_USE_FAST_MASK_EP=1
-        export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-        export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-        export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-        export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-        PARALLEL_ARGS=(
-            --moe-a2a-backend deepep
-            --deepep-config "$DEEPEP_CONFIG"
-        )
-        ;;
-    flashinfer_mxfp4)
-        PARALLEL_ARGS=(
-            --moe-runner-backend flashinfer_mxfp4
-            --disable-flashinfer-autotune
-        )
-        ;;
-    *)
-        echo "ERROR: unknown MOE_RUNNER_BACKEND='${MOE_RUNNER_BACKEND}' (expected: deepep | flashinfer_mxfp4)" >&2
-        exit 1
-        ;;
-esac
-
-# DP-attention is orthogonal to MOE_RUNNER_BACKEND.
 if [ "${DP_ATTENTION}" = "true" ]; then
-    PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
+    # Large-batch EP path: deepep + mega_moe.
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_USE_FAST_MASK_EP=1
+    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+    )
+    CHUNKED_PREFILL_SIZE=32768
+else
+    # Small-batch TP-only path: flashinfer_mxfp4.
+    PARALLEL_ARGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --disable-flashinfer-autotune
+    )
+    CHUNKED_PREFILL_SIZE=8192
 fi
 
 # Print all SGLANG_* env vars to both the CI step log and server.log so the
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7ed3c16ff..12278037e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1833,3 +1833,14 @@
     - "Bump --chunked-prefill-size from 4096 to 8192"
     - "Retrigger dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "EAGLE flags: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
+    - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

From 6f1b80a639f81e332005ff1c10f04f825b35bf04 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 21:20:24 -0700
Subject: [PATCH 04/17] add mtp_1 (1/1/2) variant

---
 .github/configs/nvidia-master.yaml            | 13 +++++
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 52 ++++++++++++-------
 perf-changelog.yaml                           |  2 +-
 runners/launch_b300-nv.sh                     |  2 +-
 utils/matrix_logic/validation.py              |  8 +--
 5 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0351ab754..c0d118895 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1888,19 +1888,32 @@ dsv4-fp4-b300-sglang-mtp:
   precision: fp4
   framework: sglang
   multinode: false
+  # Two EAGLE chain lengths sweep side-by-side per (tp, ep, dp-attn) combo:
+  #   mtp    -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default chain)
+  #   mtp_1  -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step)
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
+    # mtp (3/1/4)
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
+    # mtp_1 (1/1/2)
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 }
   - isl: 8192
     osl: 1024
     search-space:
+    # mtp (3/1/4)
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
+    # mtp_1 (1/1/2)
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index deac9b9ca..56a9d6899 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -3,21 +3,21 @@
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 # Tuning knobs (matrix-driven, all required - no script-side defaults):
-#   TP            -- tensor parallel size                       -> --tp
-#   EP_SIZE       -- expert parallel size                       -> --ep-size
-#   DP_ATTENTION  -- "true" enables --enable-dp-attention --dp-size $TP
-#                    Also selects MoE backend / chunked-prefill-size:
-#                      true  -> deepep + mega_moe + chunked-prefill 32768
-#                      false -> flashinfer_mxfp4  + chunked-prefill 8192
-#
-# MTP/EAGLE speculative-decoding flags are applied unconditionally on top of
-# every recipe (same draft chain across CONC ranges). Tuning the spec config
-# per recipe is left as future work once we have sweep data.
+#   TP             -- tensor parallel size                       -> --tp
+#   EP_SIZE        -- expert parallel size                       -> --ep-size
+#   DP_ATTENTION   -- "true" enables --enable-dp-attention --dp-size $TP
+#                     Also selects MoE backend / chunked-prefill-size:
+#                       true  -> deepep + mega_moe + chunked-prefill 32768
+#                       false -> flashinfer_mxfp4  + chunked-prefill 8192
+#   SPEC_DECODING  -- selects EAGLE chain length:
+#                       mtp    -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default)
+#                       mtp_1  -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step)
 check_env_vars \
     MODEL \
     TP \
     EP_SIZE \
     DP_ATTENTION \
+    SPEC_DECODING \
     CONC \
     ISL \
     OSL \
@@ -54,7 +54,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, SPEC_DECODING: $SPEC_DECODING, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -67,13 +67,29 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 # Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# MTP (EAGLE) speculative-decoding flags applied to every recipe.
-SPEC_FLAGS=(
-    --speculative-algorithm EAGLE
-    --speculative-num-steps 3
-    --speculative-eagle-topk 1
-    --speculative-num-draft-tokens 4
-)
+# MTP (EAGLE) speculative-decoding flags. Chain length selected by SPEC_DECODING.
+case "${SPEC_DECODING}" in
+    mtp_1)
+        SPEC_FLAGS=(
+            --speculative-algorithm EAGLE
+            --speculative-num-steps 1
+            --speculative-eagle-topk 1
+            --speculative-num-draft-tokens 2
+        )
+        ;;
+    mtp)
+        SPEC_FLAGS=(
+            --speculative-algorithm EAGLE
+            --speculative-num-steps 3
+            --speculative-eagle-topk 1
+            --speculative-num-draft-tokens 4
+        )
+        ;;
+    *)
+        echo "ERROR: unsupported SPEC_DECODING='${SPEC_DECODING}' (expected: mtp | mtp_1)" >&2
+        exit 1
+        ;;
+esac
 
 if [ "${DP_ATTENTION}" = "true" ]; then
     # Large-batch EP path: deepep + mega_moe.
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 12278037e..f5970f126 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1840,7 +1840,7 @@
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
     - "Image: lmsysorg/sglang:deepseek-v4-b300"
     - "Model: deepseek-ai/DeepSeek-V4-Pro"
-    - "EAGLE flags: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Two EAGLE chain lengths swept side-by-side per (tp, ep, dp-attn) combo: mtp=3/1/4 (default) and mtp_1=1/1/2 (single-step)"
     - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
     - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 3c855e805..27760df4b 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -259,7 +259,7 @@ else
         export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
     fi
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-    SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+    SPEC_SUFFIX=$([[ "$SPEC_DECODING" == mtp* ]] && printf '_mtp' || printf '')
     # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models
     # with multiple inference engines can coexist; fall back to the historical
     # name without an engine suffix (`_trt` for trt, bare for everyone else)
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index ce10840b5..9210e0b07 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -77,7 +77,7 @@ class SingleNodeMatrixEntry(BaseModel):
     model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
     precision: str
     framework: str
-    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
         alias=Fields.SPEC_DECODING.value
     )
     runner: str
@@ -116,7 +116,7 @@ class MultiNodeMatrixEntry(BaseModel):
     model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
     precision: str
     framework: str
-    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
         alias=Fields.SPEC_DECODING.value
     )
     runner: str
@@ -204,7 +204,7 @@ class SingleNodeSearchSpaceEntry(BaseModel):
 
     tp: int
     ep: Optional[int] = None
-    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
         default="none", alias=Fields.SPEC_DECODING.value)
     dp_attn: Optional[bool] = Field(
         default=None, alias=Fields.DP_ATTN.value)
@@ -224,7 +224,7 @@ class MultiNodeSearchSpaceEntry(BaseModel):
     """Multinode search space configuration."""
     model_config = ConfigDict(extra='forbid', populate_by_name=True)
 
-    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
         default="none", alias=Fields.SPEC_DECODING.value)
     prefill: WorkerConfig
     decode: WorkerConfig

From 1b34a8d0d3068914a5965aafa7a2225028c012d6 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 21:34:15 -0700
Subject: [PATCH 05/17] dispatch mtp/mtp_1 configs by CONC range

---
 .github/configs/nvidia-master.yaml | 33 +++++++++++++-----------------
 perf-changelog.yaml                |  8 +++++---
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c0d118895..0b09dc048 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1888,32 +1888,27 @@ dsv4-fp4-b300-sglang-mtp:
   precision: fp4
   framework: sglang
   multinode: false
-  # Two EAGLE chain lengths sweep side-by-side per (tp, ep, dp-attn) combo:
-  #   mtp    -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default chain)
-  #   mtp_1  -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step)
+  # Four configs dispatched by CONC, with overlap at transitions:
+  #   A: TP=8 ep=1, mtp (3/1/4)            -- conc 1-8     (latency-bound)
+  #   B: TP=4 ep=1, mtp (3/1/4)            -- conc 16-128  (TP-only mid batch)
+  #   C: TP=4 ep=4 dp-attn, mtp (3/1/4)    -- conc 64-256  (DP-attn + EP)
+  #   D: TP=4 ep=4 dp-attn, mtp_1 (1/1/2)  -- conc 256-512 (short spec at large batch)
+  # Overlaps: B/C at conc 64,128 (TP-only vs DP-attn EP); C/D at 256 (3/1/4 vs 1/1/2).
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    # mtp (3/1/4)
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
-    # mtp_1 (1/1/2)
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 }
-    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 }
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 }
   - isl: 8192
     osl: 1024
     search-space:
-    # mtp (3/1/4)
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp }
-    # mtp_1 (1/1/2)
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 }
-    - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 }
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f5970f126..e82104fdc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1840,7 +1840,9 @@
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
     - "Image: lmsysorg/sglang:deepseek-v4-b300"
     - "Model: deepseek-ai/DeepSeek-V4-Pro"
-    - "Two EAGLE chain lengths swept side-by-side per (tp, ep, dp-attn) combo: mtp=3/1/4 (default) and mtp_1=1/1/2 (single-step)"
-    - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
-    - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512"
+    - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)"
+    - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256"
+    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn"
+    - "EAGLE chain selected by spec-decoding: mtp=3/1/4 vs mtp_1=1/1/2"
+    - "Configs: 1k1k and 8k1k, total 26 sweep entries"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

From 481482ac44cbe021795176f21c462606419ea250 Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 22:03:14 -0700
Subject: [PATCH 06/17] pin sglang image to mega_moe-capable digest

---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0b09dc048..35a36b728 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1881,7 +1881,7 @@ dsv4-fp4-b300-sglang:
 # `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
 # while the TP-only path leaves ep_size at the default of 1.
 dsv4-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:deepseek-v4-b300
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e82104fdc..7541a81ec 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1838,7 +1838,7 @@
     - dsv4-fp4-b300-sglang-mtp
   description:
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
-    - "Image: lmsysorg/sglang:deepseek-v4-b300"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 (pinned for deep_gemm transform_weights_for_mega_moe support)"
     - "Model: deepseek-ai/DeepSeek-V4-Pro"
     - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)"
     - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256"

From 47fefec19307f8d77558701d3e312365a2ad2c4c Mon Sep 17 00:00:00 2001
From: Liangsheng Yin <lsyincs@gmail.com>
Date: Sat, 25 Apr 2026 22:18:47 -0700
Subject: [PATCH 07/17] drop mtp_1 knob; align with PR #1158 image digest

---
 .github/configs/nvidia-master.yaml            | 19 +++----
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 53 +++++++------------
 perf-changelog.yaml                           | 11 ++--
 runners/launch_b300-nv.sh                     |  2 +-
 utils/matrix_logic/validation.py              |  8 +--
 5 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 35a36b728..ee3d4dc9e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1881,34 +1881,31 @@ dsv4-fp4-b300-sglang:
 # `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
 # while the TP-only path leaves ep_size at the default of 1.
 dsv4-fp4-b300-sglang-mtp:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
   framework: sglang
   multinode: false
-  # Four configs dispatched by CONC, with overlap at transitions:
-  #   A: TP=8 ep=1, mtp (3/1/4)            -- conc 1-8     (latency-bound)
-  #   B: TP=4 ep=1, mtp (3/1/4)            -- conc 16-128  (TP-only mid batch)
-  #   C: TP=4 ep=4 dp-attn, mtp (3/1/4)    -- conc 64-256  (DP-attn + EP)
-  #   D: TP=4 ep=4 dp-attn, mtp_1 (1/1/2)  -- conc 256-512 (short spec at large batch)
-  # Overlaps: B/C at conc 64,128 (TP-only vs DP-attn EP); C/D at 256 (3/1/4 vs 1/1/2).
+  # Three CONC bands sweep with EAGLE/MTP (3/1/4) on top:
+  #   A: TP=8 ep=1            -- conc 1-8    (latency-bound, full TP)
+  #   B: TP=4 ep=1            -- conc 16-128 (TP-only, mid batch)
+  #   C: TP=4 ep=4 dp-attn    -- conc 64-512 (DP-attn + EP, large batch)
+  # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head).
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
     - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 56a9d6899..7f012c5b2 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -2,22 +2,21 @@
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
-# Tuning knobs (matrix-driven, all required - no script-side defaults):
-#   TP             -- tensor parallel size                       -> --tp
-#   EP_SIZE        -- expert parallel size                       -> --ep-size
-#   DP_ATTENTION   -- "true" enables --enable-dp-attention --dp-size $TP
-#                     Also selects MoE backend / chunked-prefill-size:
-#                       true  -> deepep + mega_moe + chunked-prefill 32768
-#                       false -> flashinfer_mxfp4  + chunked-prefill 8192
-#   SPEC_DECODING  -- selects EAGLE chain length:
-#                       mtp    -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default)
-#                       mtp_1  -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step)
+# Tuning inputs from the matrix (all required):
+#   TP            -- tensor parallel size                       -> --tp
+#   EP_SIZE       -- expert parallel size                       -> --ep-size
+#   DP_ATTENTION  -- "true" enables --enable-dp-attention --dp-size $TP
+#                    Also selects MoE backend / chunked-prefill-size:
+#                      true  -> deepep + mega_moe + chunked-prefill 32768
+#                      false -> flashinfer_mxfp4  + chunked-prefill 8192
+#
+# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
+# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
 check_env_vars \
     MODEL \
     TP \
     EP_SIZE \
     DP_ATTENTION \
-    SPEC_DECODING \
     CONC \
     ISL \
     OSL \
@@ -54,7 +53,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, SPEC_DECODING: $SPEC_DECODING, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -67,29 +66,13 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 # Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# MTP (EAGLE) speculative-decoding flags. Chain length selected by SPEC_DECODING.
-case "${SPEC_DECODING}" in
-    mtp_1)
-        SPEC_FLAGS=(
-            --speculative-algorithm EAGLE
-            --speculative-num-steps 1
-            --speculative-eagle-topk 1
-            --speculative-num-draft-tokens 2
-        )
-        ;;
-    mtp)
-        SPEC_FLAGS=(
-            --speculative-algorithm EAGLE
-            --speculative-num-steps 3
-            --speculative-eagle-topk 1
-            --speculative-num-draft-tokens 4
-        )
-        ;;
-    *)
-        echo "ERROR: unsupported SPEC_DECODING='${SPEC_DECODING}' (expected: mtp | mtp_1)" >&2
-        exit 1
-        ;;
-esac
+# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
+SPEC_FLAGS=(
+    --speculative-algorithm EAGLE
+    --speculative-num-steps 3
+    --speculative-eagle-topk 1
+    --speculative-num-draft-tokens 4
+)
 
 if [ "${DP_ATTENTION}" = "true" ]; then
     # Large-batch EP path: deepep + mega_moe.
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7541a81ec..7dfa95310 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1838,11 +1838,10 @@
     - dsv4-fp4-b300-sglang-mtp
   description:
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
-    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 (pinned for deep_gemm transform_weights_for_mega_moe support)"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
     - "Model: deepseek-ai/DeepSeek-V4-Pro"
-    - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)"
-    - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256"
-    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn"
-    - "EAGLE chain selected by spec-decoding: mtp=3/1/4 vs mtp_1=1/1/2"
-    - "Configs: 1k1k and 8k1k, total 26 sweep entries"
+    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
+    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
+    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 27760df4b..3c855e805 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -259,7 +259,7 @@ else
         export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
     fi
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-    SPEC_SUFFIX=$([[ "$SPEC_DECODING" == mtp* ]] && printf '_mtp' || printf '')
+    SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models
     # with multiple inference engines can coexist; fall back to the historical
     # name without an engine suffix (`_trt` for trt, bare for everyone else)
diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py
index 9210e0b07..ce10840b5 100644
--- a/utils/matrix_logic/validation.py
+++ b/utils/matrix_logic/validation.py
@@ -77,7 +77,7 @@ class SingleNodeMatrixEntry(BaseModel):
     model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
     precision: str
     framework: str
-    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
         alias=Fields.SPEC_DECODING.value
     )
     runner: str
@@ -116,7 +116,7 @@ class MultiNodeMatrixEntry(BaseModel):
     model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value)
     precision: str
     framework: str
-    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
         alias=Fields.SPEC_DECODING.value
     )
     runner: str
@@ -204,7 +204,7 @@ class SingleNodeSearchSpaceEntry(BaseModel):
 
     tp: int
     ep: Optional[int] = None
-    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
         default="none", alias=Fields.SPEC_DECODING.value)
     dp_attn: Optional[bool] = Field(
         default=None, alias=Fields.DP_ATTN.value)
@@ -224,7 +224,7 @@ class MultiNodeSearchSpaceEntry(BaseModel):
     """Multinode search space configuration."""
     model_config = ConfigDict(extra='forbid', populate_by_name=True)
 
-    spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field(
+    spec_decoding: Literal["mtp", "draft_model", "none"] = Field(
         default="none", alias=Fields.SPEC_DECODING.value)
     prefill: WorkerConfig
     decode: WorkerConfig

From 287ef26124bb16b71a61b11e47ad858afebe385c Mon Sep 17 00:00:00 2001
From: Yuhao Yang <47235274+yhyang201@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:37:49 +0800
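Subject: [PATCH 08/17] update nvidia-master.yaml

Drop the TP=4 search-space entries (ep=1 conc 16-128 and ep=4 dp-attn
conc 64-512) from dsv4-fp4-b300-sglang-mtp for both 1k1k and 8k1k,
leaving only the TP=8 conc 1-8 entry.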

---
 .github/configs/nvidia-master.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ee3d4dc9e..6acb8967b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1898,14 +1898,10 @@ dsv4-fp4-b300-sglang-mtp:
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-    - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e

From f64505b9ed22dc0f603570530cbc7ad70aac0b6c Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 17:50:18 +0800
Subject: [PATCH 09/17] fix: restore trailing newline in perf-changelog.yaml

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7ec0bf8aa..5ac45fdff 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1873,4 +1873,4 @@
     - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
     - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
     - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
\ No newline at end of file
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

From 4f468d68012d7c611107f2a8279716be63df7af1 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 20:09:48 +0800
Subject: [PATCH 10/17] fix: remove --use-chat-template and floor
 --max-running-requests at 8

The DSv4-Pro tokenizer has no chat_template set, so passing
--use-chat-template makes benchmark_serving.py crash with a ValueError.
Remove the flag, which also aligns this script with dsv4_fp4_b300_sglang.sh.

Also add a floor of 8 to --max-running-requests to match the
base script and avoid too-low values at low concurrency.
---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 7f012c5b2..767b9a8f9 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -116,7 +116,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --tp $TP \
     --ep-size $EP_SIZE \
     --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
-    --max-running-requests "$((CONC * 3 / 2))" \
+    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
     --mem-fraction-static 0.90 \
     --swa-full-tokens-ratio 0.1 \
     "${SPEC_FLAGS[@]}" \
@@ -138,8 +138,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/" \
-    --use-chat-template
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"

From fc93e84bda55448f4d30006b15eb7e99b6f0bbb1 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 20:13:11 +0800
Subject: [PATCH 11/17] perf-changelog: add dsv4-fp4-b300-sglang-mtp entry

Rebase perf-changelog.yaml on latest main (preserving #1173 and #1174
entries) and append the MTP config entry for PR #1166.
---
 perf-changelog.yaml | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5ac45fdff..4c85924b4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1862,8 +1862,21 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
-  
-- config-keys:  
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "better performance for dp-attention"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174
+
+- config-keys:
     - dsv4-fp4-b300-sglang-mtp
   description:
     - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"

From cea70e55a839265e04009155e85a60ef5fe45e99 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:02:24 +0800
Subject: [PATCH 12/17] dsv4-b300-sglang: add conc=2048 8k1k recipe with finite
 request-rate

Add an ultra-high-concurrency DP-attention recipe (TP=8, deepep
mega_moe, chunked-prefill 65536, request-rate 16) for the 8k1k
workload at conc=2048.

To support a finite request rate, make benchmark_lib.sh's
run_benchmark_serving() accept an optional --request-rate parameter
(it defaults to inf, so all existing callers are unaffected).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  1 +
 benchmarks/benchmark_lib.sh                   |  7 +++-
 .../single_node/dsv4_fp4_b300_sglang.sh       | 32 +++++++++++++++++--
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3a7ba3df1..7b31bfe29 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1866,6 +1866,7 @@ dsv4-fp4-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 268745735..9845ee38c 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -205,6 +205,7 @@ run_benchmark_serving() {
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
+    local request_rate="inf"
     local server_pid=""
 
     while [[ $# -gt 0 ]]; do
@@ -266,6 +267,10 @@ run_benchmark_serving() {
                 trust_remote_code=true
                 shift
                 ;;
+            --request-rate)
+                request_rate="$2"
+                shift 2
+                ;;
             --server-pid)
                 server_pid="$2"
                 shift 2
@@ -347,7 +352,7 @@ run_benchmark_serving() {
         --random-range-ratio "$random_range_ratio"
         --num-prompts "$num_prompts"
         --max-concurrency "$max_concurrency"
-        --request-rate inf
+        --request-rate "$request_rate"
         --ignore-eos
         "${profile_flag[@]}"
         --save-result
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index ac552c733..0d4940918 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -66,10 +66,35 @@ fi
 # single-instance uses flashinfer_mxfp4 with the cookbook defaults.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# Default; the DP-attn branch below overrides to 0.94.
+# Default; the DP-attn branches below override per recipe.
 MEM_FRACTION_STATIC=0.90
+MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
+REQUEST_RATE="inf"
 
-if [ "${DP_ATTENTION}" = "true" ]; then
+if [ "${DP_ATTENTION}" = "true" ] && [ "$CONC" -ge 2048 ]; then
+    # Ultra-high-concurrency DP-attention recipe: TP=8, deepep mega_moe backend.
+    export SGLANG_LOG_FORWARD_ITERS=1
+    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
+    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+    export SGLANG_OPT_USE_FAST_MASK_EP=1
+    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
+    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+    PARALLEL_ARGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --cuda-graph-max-bs 288
+        --deepep-config "$DEEPEP_CONFIG"
+        --chunked-prefill-size 65536
+        --enable-prefill-delayer
+    )
+    MEM_FRACTION_STATIC=0.87
+    MAX_RUNNING_REQUESTS=2560
+    REQUEST_RATE=16
+elif [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
     export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
@@ -111,7 +136,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
+    --max-running-requests "$MAX_RUNNING_REQUESTS" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
@@ -131,6 +156,7 @@ run_benchmark_serving \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
+    --request-rate "$REQUEST_RATE" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir "$PWD/"
 

From 97a7e7d780b5a27023ae7db1821dda5a7fadf7dc Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:05:12 +0800
Subject: [PATCH 13/17] dsv4-b300-sglang: temporarily keep only conc=2048 8k1k
 for experiment

Remove the 1k1k entries and the other 8k1k search-space entries so CI only
runs the new conc=2048 recipe. The original configs are recorded in comments
so they can be restored after the experiment.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7b31bfe29..332cb23d1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1854,18 +1854,13 @@ dsv4-fp4-b300-sglang:
   # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
   # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+  # NOTE: 1k1k and other 8k1k configs temporarily removed for conc=2048 experiment.
+  # Restore after experiment:
+  #   1k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512
+  #   8k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is

From 628e47b1f796cbb8bb8e8e03c2c476d299d3bc85 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:10:38 +0800
Subject: [PATCH 14/17] Revert "dsv4-b300-sglang: temporarily keep only
 conc=2048 8k1k for experiment"

This reverts commit 97a7e7d780b5a27023ae7db1821dda5a7fadf7dc.
---
 .github/configs/nvidia-master.yaml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 332cb23d1..7b31bfe29 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1854,13 +1854,18 @@ dsv4-fp4-b300-sglang:
   # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
   # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
-  # NOTE: 1k1k and other 8k1k configs temporarily removed for conc=2048 experiment.
-  # Restore after experiment:
-  #   1k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512
-  #   8k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
   - isl: 8192
     osl: 1024
     search-space:
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
+    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is

From 1526e9d8b04b9969ccb18ff12dd7c62d0127b320 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:11:11 +0800
Subject: [PATCH 15/17] Revert "dsv4-b300-sglang: add conc=2048 8k1k recipe
 with finite request-rate"

This reverts commit cea70e55a839265e04009155e85a60ef5fe45e99.
---
 .github/configs/nvidia-master.yaml            |  1 -
 benchmarks/benchmark_lib.sh                   |  7 +---
 .../single_node/dsv4_fp4_b300_sglang.sh       | 32 ++-----------------
 3 files changed, 4 insertions(+), 36 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7b31bfe29..3a7ba3df1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1866,7 +1866,6 @@ dsv4-fp4-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
     - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 9845ee38c..268745735 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -205,7 +205,6 @@ run_benchmark_serving() {
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
-    local request_rate="inf"
     local server_pid=""
 
     while [[ $# -gt 0 ]]; do
@@ -267,10 +266,6 @@ run_benchmark_serving() {
                 trust_remote_code=true
                 shift
                 ;;
-            --request-rate)
-                request_rate="$2"
-                shift 2
-                ;;
             --server-pid)
                 server_pid="$2"
                 shift 2
@@ -352,7 +347,7 @@ run_benchmark_serving() {
         --random-range-ratio "$random_range_ratio"
         --num-prompts "$num_prompts"
         --max-concurrency "$max_concurrency"
-        --request-rate "$request_rate"
+        --request-rate inf
         --ignore-eos
         "${profile_flag[@]}"
         --save-result
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 0d4940918..ac552c733 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -66,35 +66,10 @@ fi
 # single-instance uses flashinfer_mxfp4 with the cookbook defaults.
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-# Default; the DP-attn branches below override per recipe.
+# Default; the DP-attn branch below overrides to 0.94.
 MEM_FRACTION_STATIC=0.90
-MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
-REQUEST_RATE="inf"
 
-if [ "${DP_ATTENTION}" = "true" ] && [ "$CONC" -ge 2048 ]; then
-    # Ultra-high-concurrency DP-attention recipe: TP=8, deepep mega_moe backend.
-    export SGLANG_LOG_FORWARD_ITERS=1
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --cuda-graph-max-bs 288
-        --deepep-config "$DEEPEP_CONFIG"
-        --chunked-prefill-size 65536
-        --enable-prefill-delayer
-    )
-    MEM_FRACTION_STATIC=0.87
-    MAX_RUNNING_REQUESTS=2560
-    REQUEST_RATE=16
-elif [ "${DP_ATTENTION}" = "true" ]; then
+if [ "${DP_ATTENTION}" = "true" ]; then
     export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
     export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
     export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
@@ -136,7 +111,7 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "$MAX_RUNNING_REQUESTS" \
+    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
     --mem-fraction-static "$MEM_FRACTION_STATIC" \
     --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
     "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
@@ -156,7 +131,6 @@ run_benchmark_serving \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
-    --request-rate "$REQUEST_RATE" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir "$PWD/"
 

From 14369b1e66848dca58ab927629f51a05e5d480f1 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:12:32 +0800
Subject: [PATCH 16/17] dsv4-b300-sglang-mtp: tune EAGLE spec params from
 (3,1,4) to (4,1,5)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 767b9a8f9..d04661466 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -10,8 +10,8 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 #                      true  -> deepep + mega_moe + chunked-prefill 32768
 #                      false -> flashinfer_mxfp4  + chunked-prefill 8192
 #
-# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
-# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
+# EAGLE/MTP speculative-decoding flags are hardcoded to (4, 1, 5): num-steps=4,
+# eagle-topk=1, num-draft-tokens=5. Same chain across all CONC bands.
 check_env_vars \
     MODEL \
     TP \
@@ -69,9 +69,9 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
 # MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
 SPEC_FLAGS=(
     --speculative-algorithm EAGLE
-    --speculative-num-steps 3
+    --speculative-num-steps 4
     --speculative-eagle-topk 1
-    --speculative-num-draft-tokens 4
+    --speculative-num-draft-tokens 5
 )
 
 if [ "${DP_ATTENTION}" = "true" ]; then

From 42b294d47a6a62aede23b68ab21bc66eeebfb775 Mon Sep 17 00:00:00 2001
From: yhyang201 <yhyang201@gmail.com>
Date: Sun, 26 Apr 2026 21:20:46 +0800
Subject: [PATCH 17/17] Revert "dsv4-b300-sglang-mtp: tune EAGLE spec params
 from (3,1,4) to (4,1,5)"

This reverts commit 14369b1e66848dca58ab927629f51a05e5d480f1.
---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index d04661466..767b9a8f9 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -10,8 +10,8 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 #                      true  -> deepep + mega_moe + chunked-prefill 32768
 #                      false -> flashinfer_mxfp4  + chunked-prefill 8192
 #
-# EAGLE/MTP speculative-decoding flags are hardcoded to (4, 1, 5): num-steps=4,
-# eagle-topk=1, num-draft-tokens=5. Same chain across all CONC bands.
+# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
+# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
 check_env_vars \
     MODEL \
     TP \
@@ -69,9 +69,9 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}
 # MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
 SPEC_FLAGS=(
     --speculative-algorithm EAGLE
-    --speculative-num-steps 4
+    --speculative-num-steps 3
     --speculative-eagle-topk 1
-    --speculative-num-draft-tokens 5
+    --speculative-num-draft-tokens 4
 )
 
 if [ "${DP_ATTENTION}" = "true" ]; then