From 97b4aae7fcb9dec0b2b87e292cf606a79fba80d4 Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Sat, 25 Apr 2026 08:09:40 -0700
Subject: [PATCH 1/8] Update dsv4 b300 search space and max-num-batched-tokens

---
 .github/configs/nvidia-master.yaml           | 12 +++++-------
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh |  9 ++++++++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..17843f847 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2477,17 +2477,15 @@ dsv4-fp4-b300-vllm:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 4 }
     - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 8, conc-start: 128, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
+    - { tp: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, conc-start: 4, conc-end: 4 }
-    - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 8, conc-start: 128, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - { tp: 4, conc-start: 4, conc-end: 64 }
+    - { tp: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
+    - { tp: 8, dp-attn: true, conc-start: 1024, conc-end: 8192 }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 52f38c4d9..5eb6b2ad2 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -38,6 +38,13 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
 fi
 
+# DP mode: mbt=ISL; TP mode: mbt=2*ISL; floor at 2048
+if [ "${DP_ATTENTION}" = "true" ]; then
+    MAX_NUM_BATCHED_TOKENS=$(( ISL < 2048 ? 2048 : ISL ))
+else
+    MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 < 2048 ? 2048 : ISL * 2 ))
+fi
+
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then
     BENCHMARK_MAX_MODEL_LEN=4096
@@ -71,7 +78,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --reasoning-parser deepseek_v4 \
     --max-cudagraph-capture-size 2048 \
     --max-model-len "$SERVE_MAX_MODEL_LEN" \
-    --max-num-batched-tokens 2048 > "$SERVER_LOG" 2>&1 &
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
 

From 9043b4abbd4ab261bbed015842b5b904dd5abe75 Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Sat, 25 Apr 2026 08:37:40 -0700
Subject: [PATCH 2/8] Add perf changelog

---
 perf-changelog.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 397da6591..29595ecd7 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1819,3 +1819,12 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp4-b300-vllm
+  description:
+    - "Update search space based on B300 pareto sweep results"
+    - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
+    - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
+

From bc2be9cc374e3da13b4b7d04286a80062fe51801 Mon Sep 17 00:00:00 2001
From: wzhao18 <wzhao18.sz@gmail.com>
Date: Sat, 25 Apr 2026 09:33:13 -0700
Subject: [PATCH 3/8] Enable expert parallel only for dp-attn configs

---
 .github/configs/nvidia-master.yaml           |  8 ++++----
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 17843f847..0d06dd76e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2478,14 +2478,14 @@ dsv4-fp4-b300-vllm:
     osl: 1024
     search-space:
     - { tp: 4, conc-start: 4, conc-end: 128 }
-    - { tp: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
-    - { tp: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 4096 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 8192 }
   - isl: 8192
     osl: 1024
     search-space:
     - { tp: 4, conc-start: 4, conc-end: 64 }
-    - { tp: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
-    - { tp: 8, dp-attn: true, conc-start: 1024, conc-end: 8192 }
+    - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 8192 }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 5eb6b2ad2..f0ed31cf3 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -1,9 +1,8 @@
 #!/usr/bin/env bash
 
 # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
-# pareto sweep. The matrix uses dp-attn=true as the existing switch to flip a
-# 4-GPU run from TP4 to DP4. Expert parallel is always enabled to match the
-# provided vllm serve command exactly.
+# pareto sweep. TP mode (dp-attn=false) runs without expert parallel; DP mode
+# (dp-attn=true) enables expert parallel (EP_SIZE=TP value = DP size).
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -38,6 +37,11 @@ if [ "${DP_ATTENTION}" = "true" ]; then
     PARALLEL_ARGS=(--tensor-parallel-size 1 --data-parallel-size "$TP")
 fi
 
+EP_ARGS=()
+if [ "${EP_SIZE:-1}" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
 # DP mode: mbt=ISL; TP mode: mbt=2*ISL; floor at 2048
 if [ "${DP_ATTENTION}" = "true" ]; then
     MAX_NUM_BATCHED_TOKENS=$(( ISL < 2048 ? 2048 : ISL ))
@@ -69,7 +73,7 @@ vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
     --trust-remote-code \
     --block-size 256 \
     --no-enable-prefix-caching \
-    --enable-expert-parallel \
+    "${EP_ARGS[@]}" \
     --compilation-config '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' \
     --attention_config.use_fp4_indexer_cache True \
     --tokenizer-mode deepseek_v4 \

From bc87ac82c8f3c20eddab01a78000fe23b52bc7de Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Sat, 25 Apr 2026 17:18:31 -0400
Subject: [PATCH 4/8] Lower 8k/1k DP8 conc-end from 8192 to 4096 in nvidia-master.yaml

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0d06dd76e..64f609b3e 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2485,7 +2485,7 @@ dsv4-fp4-b300-vllm:
     search-space:
     - { tp: 4, conc-start: 4, conc-end: 64 }
     - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 1024 }
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 8192 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 1024, conc-end: 4096 }
 
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.9-cu129-amd64

From c5d88fc96106cf2b9ed8484563fdd4b41cab71ae Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:07:28 -0400
Subject: [PATCH 5/8] Pin DP-mode MAX_NUM_BATCHED_TOKENS to 2048

---
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index f0ed31cf3..e8444ea4c 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -42,9 +42,8 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
 fi
 
-# DP mode: mbt=ISL; TP mode: mbt=2*ISL; floor at 2048
 if [ "${DP_ATTENTION}" = "true" ]; then
-    MAX_NUM_BATCHED_TOKENS=$(( ISL < 2048 ? 2048 : ISL ))
+    MAX_NUM_BATCHED_TOKENS=2048
 else
     MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 < 2048 ? 2048 : ISL * 2 ))
 fi

From 457398a73f4af3e32d293042ea79c712a83ebe50 Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Sat, 25 Apr 2026 20:15:53 -0400
Subject: [PATCH 6/8] Simplify MAX_NUM_BATCHED_TOKENS assignment

---
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index e8444ea4c..fb6518601 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -42,11 +42,7 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
 fi
 
-if [ "${DP_ATTENTION}" = "true" ]; then
-    MAX_NUM_BATCHED_TOKENS=2048
-else
-    MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 < 2048 ? 2048 : ISL * 2 ))
-fi
+MAX_NUM_BATCHED_TOKENS=2048
 
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then

From 0ed17a1ddfc2249298ee8d56e707bdbd794a459f Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Sun, 26 Apr 2026 00:26:19 -0400
Subject: [PATCH 7/8] Modify MAX_NUM_BATCHED_TOKENS based on DP_ATTENTION

---
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index fb6518601..6bb5b9049 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -42,7 +42,11 @@ if [ "${EP_SIZE:-1}" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
 fi
 
-MAX_NUM_BATCHED_TOKENS=2048
+if [ "${DP_ATTENTION}" = "true" ]; then
+    MAX_NUM_BATCHED_TOKENS=2048
+else
+    MAX_NUM_BATCHED_TOKENS=$(( ISL * 2 ))
+fi
 
 BENCHMARK_MAX_MODEL_LEN="$MAX_MODEL_LEN"
 if [ "$ISL" -eq 1024 ] && [ "$OSL" -eq 1024 ]; then

From 427a963b5fff45b56407e4f075cb00be654c1066 Mon Sep 17 00:00:00 2001
From: Wei Zhao <51183510+wzhao18@users.noreply.github.com>
Date: Sun, 26 Apr 2026 00:28:01 -0400
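Subject: [PATCH 8/8] Clean up perf-changelog.yaml entries

Remove three changelog entries (dsv4-fp4-gb200-dynamo-vllm, dsv4-fp4-b300-sglang, and a dsv4-fp8-mi355x-sglang entry with no pr-link that repeats the entry after it), strip trailing whitespace, and add the missing final newline after the dsv4-fp4-b300-vllm entry that records the updated search space.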
---
 perf-changelog.yaml | 27 ++-------------------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3c1042626..e5dc8c279 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1826,29 +1826,6 @@
     - "Retriggering dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148
 
-- config-keys:
-    - dsv4-fp4-gb200-dynamo-vllm
-  description:
-    - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)"
-    - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
-    - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
-    - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129
-  
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
-
-- config-keys:
-    - dsv4-fp8-mi355x-sglang
-  description:
-    - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh"
-    - "Bump --chunked-prefill-size from 4096 to 8192"
-    - "Retrigger dsv4-fp8-mi355x-sglang"
-
 - config-keys:
     - dsv4-fp8-mi355x-sglang
   description:
@@ -1856,11 +1833,11 @@
     - "Bump --chunked-prefill-size from 4096 to 8192"
     - "Retrigger dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160
-  
+
 - config-keys:
     - dsv4-fp4-b300-vllm
   description:
     - "Update search space based on B300 pareto sweep results"
     - "ISL=1024: TP4 conc 4-128; DP4 (dp-attn) conc 256-4096; DP8 (dp-attn) conc 2048-8192"
     - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
\ No newline at end of file
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155