From 93db2e2b3f9f99ac86c7d2f28cc5b718b62661de Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 13:00:49 -0700
Subject: [PATCH 01/56] Day 0 DeepSeek V4 Pro FP4 GB200 disaggregated SGLang
 benchmarks

---
 .github/configs/nvidia-master.yaml            | 112 +++++++++++++++++
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    | 110 +++++++++++++++++
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 115 ++++++++++++++++++
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 111 +++++++++++++++++
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 106 ++++++++++++++++
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 109 +++++++++++++++++
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 110 +++++++++++++++++
 perf-changelog.yaml                           |   9 ++
 runners/launch_gb200-nv.sh                    |  16 +++
 9 files changed, 798 insertions(+)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 42c720a63..b2d361f65 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7666,3 +7666,115 @@ dsv4-fp4-gb200-dynamo-vllm:
         tp: 16
         ep: 16
         dp-attn: true
+
+dsv4-fp4-gb200-dynamo-sglang:
+  image: lmsysorg/sglang:deepseek-v4-grace-blackwell
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg
+  # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling
+  # so framework-level numbers are directly comparable. Per-worker
+  # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and
+  # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml
+  # (DSR1 sglang disagg structure).
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
+    - conc-list: [1, 4, 8, 16, 32, 64]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes.
+    - conc-list: [128, 256, 1024, 2048, 4096]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+    # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
+    # 4096 overlaps the mid (1p1d-dep8-dep16) block for a crossover A/B.
+    - conc-list: [4096, 8192]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
+
+  # 8k/1k block kept commented out for the same reason as its
+  # dsv4-fp4-gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime
+  # bounded. Uncomment to re-enable (recipes are already in place).
+  # - isl: 8192
+  #   osl: 1024
+  #   search-space:
+  #   # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
+  #   - conc-list: [1, 4, 8, 16, 32, 64]
+  #     prefill:
+  #       num-worker: 1
+  #       tp: 8
+  #       ep: 8
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+  #     decode:
+  #       num-worker: 1
+  #       tp: 8
+  #       ep: 1
+  #       dp-attn: false
+  #   # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
+  #   - conc-list: [512, 1024]
+  #     prefill:
+  #       num-worker: 3
+  #       tp: 8
+  #       ep: 8
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+  #     decode:
+  #       num-worker: 1
+  #       tp: 16
+  #       ep: 16
+  #       dp-attn: true
+  #   # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes.
+  #   - conc-list: [4096, 8192]
+  #     prefill:
+  #       num-worker: 7
+  #       tp: 8
+  #       ep: 8
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
+  #     decode:
+  #       num-worker: 1
+  #       tp: 16
+  #       ep: 16
+  #       dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..6eecc801b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -0,0 +1,110 @@
+name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16"
+
+# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
+# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
+# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
+#
+# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes.
+# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank
+# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing).
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 16
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 16
+      dp-size: 16
+      ep-size: 16
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x1024x2048x4096"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
new file mode 100644
index 000000000..5c44400e3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -0,0 +1,115 @@
+name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
+
+# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The
+# closest references on NVIDIA/srt-slurm are:
+#   * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) —
+#     GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars.
+#   * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) —
+#     GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 +
+#     chunked-prefill-size=4096 + disable-flashinfer-autotune.
+# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
+# framework numbers stay directly comparable.
+#
+# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes.
+# Targets very low concurrency (1-64) where TP-sharded decode gives the
+# best per-user latency.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline
+  # that's actually been run upstream) plus the disaggregation timeout
+  # triple — heartbeat 100k matches the DSR1 sglang disagg convention.
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 16
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 64
+      cuda-graph-max-bs: 64
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..bb61350b2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -0,0 +1,111 @@
+name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
+
+# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
+# upstream-reference list. Topology mirrors the
+# dsv4-fp4-gb200-dynamo-vllm sibling.
+#
+# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes.
+# Sized for conc 4096-8192 — at those concurrencies a single prefill
+# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the
+# 1k prefill arrival rate exceeds what one DP=8 worker can sustain.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 16
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 16
+      dp-size: 16
+      ep-size: 16
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 3072
+      max-running-requests: 1024
+      cuda-graph-max-bs: 1024
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
new file mode 100644
index 000000000..abe23d2dd
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -0,0 +1,106 @@
+name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
+
+# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and
+# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280
+# (8k+1k+pad), and prefill max-running-requests halves to keep the per-
+# rank prefill working set inside the GPU memory budget.
+#
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream-
+# reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 8
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 64
+      cuda-graph-max-bs: 64
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..bdbfaa735
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -0,0 +1,109 @@
+name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
+
+# 8k/1k mid-throughput topology: 3 prefill (DP=8 EP=8) + 1 wide decode
+# (DP=16 EP=16). 10 nodes. Targets conc 512-1024 — with 8k prompts a
+# single prefill worker is already saturated before conc reaches 512.
+#
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference
+# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 4
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 16
+      dp-size: 16
+      ep-size: 16
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..de9bd45df
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -0,0 +1,110 @@
+name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+
+# 8k/1k max-throughput topology: 7 prefill (DP=8 EP=8) + 1 wide decode
+# (DP=16 EP=16). 18 nodes — full GB200 cluster. Targets conc 4096-8192.
+# Per-worker tunings identical to the 3p1d sibling; only prefill_workers
+# and prefill_nodes scale up.
+#
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference
+# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+dynamo:
+  version: 0.8.1
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 14
+  decode_nodes: 4
+  prefill_workers: 7
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+  connector: null
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 4
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      tensor-parallel-size: 16
+      dp-size: 16
+      ep-size: 16
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      disable-radix-cache: true
+      mem-fraction-static: 0.82
+      context-length: 9280
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+      stream-interval: 50
+      decode-log-interval: 1000
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 397da6591..45bc466fc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1819,3 +1819,12 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp4-gb200-dynamo-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)"
+    - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
+    - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B"
+    - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 224c3a928..08897874e 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -15,6 +15,12 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
     elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/"
         export SRT_SLURM_MODEL_PREFIX="dsr1-fp4"
+    elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+        # Same compute-node-local NVMe path as the dynamo-vllm dsv4
+        # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX
+        # matches the model.path alias in our DSV4 sglang recipes.
+        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
     else
         export MODEL_PATH=$MODEL
     fi
@@ -150,6 +156,16 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
     # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case).
     mkdir -p recipes/vllm/deepseek-v4
     cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then
+    # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026
+    # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias)
+    # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm
+    # has no upstream sglang DSV4 disagg recipes yet, hence the overlay.
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout sa-submission-q2-2026
+    mkdir -p recipes/sglang/deepseek-v4
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"

From 1bc4c2e6929d098456e11557c5c0fb86423bad48 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 13:35:16 -0700
Subject: [PATCH 02/56] Drop unsupported backend.connector field from sglang
 recipes

srtctl SrtConfig schema rejects backend.connector for the sglang
backend type. The field was carried over from the dynamo-vllm dsv4
recipes (where it is valid and set to null). PR #69/#75 sglang
recipes upstream do not declare it.
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    | 1 -
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 1 -
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 1 -
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 1 -
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 1 -
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 1 -
 6 files changed, 6 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 6eecc801b..6a78c476a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -39,7 +39,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 5c44400e3..3da368c17 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -45,7 +45,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline
   # that's actually been run upstream) plus the disaggregation timeout
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index bb61350b2..12b1207bb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -40,7 +40,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index abe23d2dd..54debefef 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -39,7 +39,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index bdbfaa735..f377c803e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -38,7 +38,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index de9bd45df..53b7661d6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -39,7 +39,6 @@ frontend:
 
 backend:
   type: sglang
-  connector: null
 
   prefill_environment:
     PYTHONUNBUFFERED: "1"

From 65b8b1711de84af4c253df12512b1638108abb46 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 14:05:08 -0700
Subject: [PATCH 03/56] =?UTF-8?q?Drop=20dynamo:=20version:=200.8.1=20?=
 =?UTF-8?q?=E2=80=94=20incompatible=20with=20deepseek-v4-grace-blackwell?=
 =?UTF-8?q?=20sglang=20fork?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-installing dynamo 0.8.1 over the lmsysorg/sglang:deepseek-v4-grace-blackwell
container's pre-baked sglang fails at import time:

    File ".../dynamo/sglang/health_check.py", line 20
      def _get_bos_token_id_from_engine(engine: Optional[sgl.Engine])
    AttributeError: module 'sglang' has no attribute 'Engine'

The DSV4 sglang fork bundled in this image does not expose sgl.Engine.
Drop the dynamo: block so srtctl uses the dynamo build pre-installed in
the container — matches NVIDIA/srt-slurm PR #75 (the only upstream
DSV4 sglang disagg recipe), which also has no dynamo: block.
---
 .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml   | 7 +++++--
 .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 9 +++++++--
 .../deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 7 +++++--
 .../deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 7 +++++--
 .../deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 7 +++++--
 .../deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 7 +++++--
 6 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 6a78c476a..f497da7fc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -13,8 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
+# rationale. srtctl uses the dynamo build baked into the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
+# dynamo 0.8.1 on top breaks startup with `AttributeError: module
+# 'sglang' has no attribute 'Engine'`.
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 3da368c17..f616b553d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -19,8 +19,13 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — srtctl skips the dynamo pip install and uses the
+# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell
+# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang
+# recipes) imports `sgl.Engine`, which this image's sglang fork does not
+# expose, so re-installing it breaks startup with `AttributeError:
+# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream
+# DSV4 sglang disagg recipe) follows the same pattern.
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 12b1207bb..e382271b8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -14,8 +14,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
+# rationale. srtctl uses the dynamo build baked into the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
+# dynamo 0.8.1 on top breaks startup with `AttributeError: module
+# 'sglang' has no attribute 'Engine'`.
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 54debefef..226565d55 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -13,8 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale. srtctl uses the dynamo build baked into the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
+# dynamo 0.8.1 on top breaks startup with `AttributeError: module
+# 'sglang' has no attribute 'Engine'`.
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index f377c803e..6bb69816c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -12,8 +12,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale. srtctl uses the dynamo build baked into the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
+# dynamo 0.8.1 on top breaks startup with `AttributeError: module
+# 'sglang' has no attribute 'Engine'`.
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 53b7661d6..311482e37 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -13,8 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-dynamo:
-  version: 0.8.1
+# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale. srtctl uses the dynamo build baked into the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
+# dynamo 0.8.1 on top breaks startup with `AttributeError: module
+# 'sglang' has no attribute 'Engine'`.
 
 slurm:
   time_limit: "8:00:00"

From 9d883ba0d474fb76c022f286ee30bd59e6413802 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 14:11:23 -0700
Subject: [PATCH 04/56] =?UTF-8?q?Add=20dynamo:=20install:=20false=20?=
 =?UTF-8?q?=E2=80=94=20srtctl=20default=20is=20install=3DTrue?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

srtctl's DynamoConfig (src/srtctl/core/schema.py L680) defaults to
install=True, so dynamo 0.8.0 is pip-installed even when no `dynamo:`
block is specified. Use the explicit opt-out so srtctl uses the dynamo
build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell
image. This image's sglang fork doesn't expose sgl.Engine, which
dynamo.sglang.health_check imports at top level — re-installing
dynamo over it breaks startup.
---
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml       | 10 +++++-----
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml        | 16 +++++++++-------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml       | 10 +++++-----
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml        | 10 +++++-----
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml       | 10 +++++-----
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml       | 10 +++++-----
 6 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index f497da7fc..29f10cd1b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -13,11 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
-# rationale. srtctl uses the dynamo build baked into the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
-# dynamo 0.8.1 on top breaks startup with `AttributeError: module
-# 'sglang' has no attribute 'Engine'`.
+# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
+# rationale (srtctl defaults to installing dynamo 0.8.0, but that
+# breaks against the deepseek-v4-grace-blackwell sglang fork).
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index f616b553d..e2cb204d9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -19,13 +19,15 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — srtctl skips the dynamo pip install and uses the
-# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell
-# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang
-# recipes) imports `sgl.Engine`, which this image's sglang fork does not
-# expose, so re-installing it breaks startup with `AttributeError:
-# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream
-# DSV4 sglang disagg recipe) follows the same pattern.
+# `install: false` is required: srtctl's DynamoConfig defaults to
+# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's
+# `dynamo.sglang.health_check` module imports `sgl.Engine` at top
+# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's
+# sglang fork does not expose — re-installing dynamo breaks startup
+# with `AttributeError: module 'sglang' has no attribute 'Engine'`.
+# Use whatever dynamo build is already baked into the container.
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index e382271b8..1c978deac 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -14,11 +14,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
-# rationale. srtctl uses the dynamo build baked into the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
-# dynamo 0.8.1 on top breaks startup with `AttributeError: module
-# 'sglang' has no attribute 'Engine'`.
+# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
+# rationale (srtctl defaults to installing dynamo 0.8.0, but that
+# breaks against the deepseek-v4-grace-blackwell sglang fork).
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 226565d55..e2c15c775 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -13,11 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale. srtctl uses the dynamo build baked into the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
-# dynamo 0.8.1 on top breaks startup with `AttributeError: module
-# 'sglang' has no attribute 'Engine'`.
+# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
+# breaks against the deepseek-v4-grace-blackwell sglang fork).
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 6bb69816c..ddd061174 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -12,11 +12,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale. srtctl uses the dynamo build baked into the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
-# dynamo 0.8.1 on top breaks startup with `AttributeError: module
-# 'sglang' has no attribute 'Engine'`.
+# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
+# breaks against the deepseek-v4-grace-blackwell sglang fork).
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 311482e37..10dd11da0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -13,11 +13,11 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale. srtctl uses the dynamo build baked into the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing
-# dynamo 0.8.1 on top breaks startup with `AttributeError: module
-# 'sglang' has no attribute 'Engine'`.
+# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
+# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
+# breaks against the deepseek-v4-grace-blackwell sglang fork).
+dynamo:
+  install: false
 
 slurm:
   time_limit: "8:00:00"

From 1b75dd7c4e122b21142ec3b12a6353da61d7229b Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 14:39:18 -0700
Subject: [PATCH 05/56] Pin dynamo to v1.2.0-sglang-deepseek-v4-dev.1 tag (hash
 21f135f5)

install: false fixed the pip-install crash, but the
lmsysorg/sglang:deepseek-v4-grace-blackwell image doesn't have dynamo
pre-installed (ModuleNotFoundError: No module named 'dynamo'), so
srtctl needs to install something compatible.

The DSV4-targeted dynamo tag v1.2.0-sglang-deepseek-v4-dev.1 (sha
21f135f5edf40e12e6ff5db2b462d862a6d6ab9b) includes
'from __future__ import annotations' in dynamo/sglang/health_check.py
(ai-dynamo PR #7255, commit cdb7218a, 2026-03-12), which makes the
Optional[sgl.Engine] annotation lazy. The PyPI 0.8.0/0.8.1 releases
predate that fix and crash with AttributeError on this image's
sglang fork.
---
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    |  7 +++----
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 19 +++++++++++--------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    |  7 +++----
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     |  7 +++----
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    |  7 +++----
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    |  7 +++----
 6 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 29f10cd1b..06e692e67 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -13,11 +13,10 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
-# rationale (srtctl defaults to installing dynamo 0.8.0, but that
-# breaks against the deepseek-v4-grace-blackwell sglang fork).
+# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index e2cb204d9..e7c639c2a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -19,15 +19,18 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` is required: srtctl's DynamoConfig defaults to
-# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's
-# `dynamo.sglang.health_check` module imports `sgl.Engine` at top
-# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's
-# sglang fork does not expose — re-installing dynamo breaks startup
-# with `AttributeError: module 'sglang' has no attribute 'Engine'`.
-# Use whatever dynamo build is already baked into the container.
+# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI
+# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in
+# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import
+# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell
+# image's sglang fork does not expose `sgl.Engine`, so they crash at
+# import with `AttributeError: module 'sglang' has no attribute
+# 'Engine'`. The DSV4-targeted tag adds `from __future__ import
+# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the
+# annotation lazy so the module imports cleanly.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 1c978deac..3011347db 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -14,11 +14,10 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the
-# rationale (srtctl defaults to installing dynamo 0.8.0, but that
-# breaks against the deepseek-v4-grace-blackwell sglang fork).
+# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index e2c15c775..61e024a14 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -13,11 +13,10 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
-# breaks against the deepseek-v4-grace-blackwell sglang fork).
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index ddd061174..7338cdaf3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -12,11 +12,10 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
-# breaks against the deepseek-v4-grace-blackwell sglang fork).
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 10dd11da0..111f9e435 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -13,11 +13,10 @@ model:
   container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
   precision: "fp4"
 
-# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for
-# the rationale (srtctl defaults to installing dynamo 0.8.0, but that
-# breaks against the deepseek-v4-grace-blackwell sglang fork).
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  install: false
+  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
+  install: true
 
 slurm:
   time_limit: "8:00:00"

From eb3f62c3dbf734fa5ed54d8e73a538e89453b186 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 15:40:48 -0700
Subject: [PATCH 06/56] Force deepep-mode: low_latency to work around
 mxfp4+DeepEP normal-dispatch bug

Prefill warmup crashed in run 24941291328 with:

  File ".../sglang/srt/layers/quantization/mxfp4_deepseek.py", line 347
    topk_output = dispatch_output.topk_output
  AttributeError: 'DeepEPNormalDispatchOutput' object has no attribute 'topk_output'

Per sglang server_args.py, --deepep-mode defaults to 'auto', which
picks 'normal' for prefill batches and 'low_latency' for decode. The
mxfp4_deepseek MoE kernel only handles the low_latency dispatch
output shape (which carries topk_output); the normal-dispatch output
type does not, so any prefill forward (or decode warmup using
forward_idle) hits the AttributeError before the worker can serve.

Force deepep-mode: low_latency on every prefill + decode block that
uses moe-a2a-backend: deepep. The two 1p1d-dep8-tep8 decode blocks
remain TP-only (no DeepEP) and are unaffected.

Run reference: https://github.com/SemiAnalysisAI/InferenceX/actions/runs/24941291328
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 1 +
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 1 +
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 2 ++
 6 files changed, 10 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 06e692e67..f6e0144c0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -69,6 +69,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -89,6 +90,7 @@ backend:
       ep-size: 16
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index e7c639c2a..4a56f1556 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -86,6 +86,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 3011347db..c676f1618 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -70,6 +70,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -90,6 +91,7 @@ backend:
       ep-size: 16
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 61e024a14..e15e24d12 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -69,6 +69,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 7338cdaf3..290d600ef 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -68,6 +68,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -88,6 +89,7 @@ backend:
       ep-size: 16
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 111f9e435..05f289815 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -69,6 +69,7 @@ backend:
       ep-size: 8
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -89,6 +90,7 @@ backend:
       ep-size: 16
       enable-dp-attention: true
       moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From 6c608dfa33451789fca8115f7c4e475b608162a2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 16:02:31 -0700
Subject: [PATCH 07/56] =?UTF-8?q?Drop=20DeepEP=20/=20DP-attn=20/=20EP=20?=
 =?UTF-8?q?=E2=80=94=20fork-only=20mxfp4=5Fdeepseek=20bug,=20both=20dispat?=
 =?UTF-8?q?ch=20types=20broken?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The run after the deepep-mode: low_latency change failed again. Logs show
two distinct DeepEP-path failures:

1. Prefill scheduler crash:
     File '.../sglang/srt/layers/quantization/mxfp4_deepseek.py', line 347
       topk_output = dispatch_output.topk_output
     AttributeError: 'DeepEPLLDispatchOutput' object has no attribute 'topk_output'
   The earlier crash had 'DeepEPNormalDispatchOutput' — neither dispatch
   output type in this image's sglang fork exposes topk_output, so
   forcing low_latency vs normal mode does not help. mxfp4_deepseek.py
   is a fork-only file (does not exist in upstream sgl-project/sglang),
   so the API mismatch can only be fixed by rebuilding the image.

2. Decode CUDA graph capture crash:
     RuntimeError: Failed: Assertion error /sgl-workspace/DeepEP/csrc/deep_ep.cpp:1233
       'x.size(0) == topk_idx.size(0) and x.size(0) <= num_max_dispatch_tokens_per_rank'
   DeepEP low_latency_dispatch's per-rank token cap is exceeded by the
   cuda-graph-max-bs we configured.

Both failures are in the DeepEP path. Per upstream sgl-project/sglang
(server_args.py), moe_a2a_backend defaults to 'none', which uses
all-reduce/all-gather dispatch and lets TP shard the expert weights
across ranks (no separate EP needed). NVIDIA/srt-slurm PR #75 (the
only upstream DSV4 sglang disagg recipe) takes the same TP-only stance
— pure tensor-parallel-size: N with no enable-dp-attention, no
moe-a2a-backend deepep, no dp-size, no ep-size.

Drop all five DeepEP / DP-attention fields (dp-size, ep-size,
enable-dp-attention, moe-a2a-backend, deepep-mode) from all 6 recipes.
Topology shape preserved:
- 1k1k 1p1d: P TP=8 / D TP=8 (4 nodes)
- 1k1k 1p1d-wide: P TP=8 / D TP=16 (6 nodes)
- 1k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes)
- 8k1k 1p1d: P TP=8 / D TP=8 (4 nodes)
- 8k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes)
- 8k1k 7p1d-wide: P 7*TP=8 / D TP=16 (18 nodes)

DSV4-Pro at MXFP4 (~340 GB) shards comfortably under TP=8 (~42 GB/rank)
or TP=16 (~21 GB/rank) with mem-fraction-static: 0.82 leaving plenty of
KV cache headroom on each 96 GB GB200 GPU.

Topology filenames retain the 'dep8' / 'dep16' historical names from
the vLLM PR #1129 sibling for symmetry — the actual sglang_config is
TP-only.
---
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    | 10 --------
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 23 ++++++++++++-------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 10 --------
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     |  5 ----
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 10 --------
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 10 --------
 6 files changed, 15 insertions(+), 53 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index f6e0144c0..33f33fa92 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -65,11 +65,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -86,11 +81,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
-      dp-size: 16
-      ep-size: 16
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 4a56f1556..917d26dc6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -10,9 +10,21 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
 # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
 # framework numbers stay directly comparable.
 #
-# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes.
-# Targets very low concurrency (1-64) where TP-sharded decode gives the
-# best per-user latency.
+# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very
+# low concurrency (1-64).
+#
+# Why TP-only (no DeepEP, no DP-attention, no EP): the
+# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships
+# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in
+# upstream sgl-project/sglang) that reads `dispatch_output.topk_output`
+# at line 347. Neither `DeepEPNormalDispatchOutput` nor
+# `DeepEPLLDispatchOutput` exposes that field in this fork, so any
+# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75
+# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only
+# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py)
+# and lets TP shard the expert weights instead of sharding via EP.
+# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch
+# API mismatch is fixed.
 
 model:
   path: "deepseek-v4-pro"
@@ -82,11 +94,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index c676f1618..5049d6f7d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -66,11 +66,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -87,11 +82,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
-      dp-size: 16
-      ep-size: 16
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index e15e24d12..2cf890688 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -65,11 +65,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 290d600ef..6b4cb46ab 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -64,11 +64,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -85,11 +80,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
-      dp-size: 16
-      ep-size: 16
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 05f289815..fc9790730 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -65,11 +65,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
-      dp-size: 8
-      ep-size: 8
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -86,11 +81,6 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
-      dp-size: 16
-      ep-size: 16
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From 2bb3ef073a5ae669dd4f2896947ea5c6bbbbd195 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 16:21:08 -0700
Subject: [PATCH 08/56] =?UTF-8?q?Add=20moe-dense-tp-size:=201=20=E2=80=94?=
 =?UTF-8?q?=20fix=20shared-experts=20FP8=20block-quant=20divisibility=20at?=
 =?UTF-8?q?=20TP=3D8/16?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the DeepEP removal, model load crashed at:

  File '.../sglang/srt/layers/quantization/fp8.py', line 282, in validate_block_quant_shapes
    raise ValueError(
  ValueError: Weight output_partition_size = 192 is not divisible
              by weight quantization block_n = 128.

DSV4-Pro's shared-experts gate_up_proj (intermediate ~1536) FP8-quants
in 128-element blocks. With TP=8 the per-rank slice is 1536/8=192,
which fails the divisibility check. PR #75 sidesteps this by using
TP=4 (1536/4=384), but that locks us into single-node workers.

sglang's --moe-dense-tp-size flag is the documented workaround
(server_args.py: 'useful when, with large TP size, there are errors
caused by weights in MLP layers having dimension smaller than the
min dimension GEMM supports'). Setting moe-dense-tp-size: 1 runs the
shared / dense-MLP layers replicated across ranks (TP=1) while the
rest of the model — attention, routed experts — keeps TP=8/16. Memory
cost is small since shared experts are a fraction of total weights.

Applied to all 6 recipes; topology/node counts unchanged.
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 33f33fa92..7081919fc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -65,6 +65,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -81,6 +82,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 917d26dc6..6c7df35e4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -94,6 +94,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -110,6 +111,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 5049d6f7d..9ddf19ee7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -66,6 +66,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -82,6 +83,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 2cf890688..4112e4244 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -65,6 +65,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -81,6 +82,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 6b4cb46ab..d9f43773f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -64,6 +64,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -80,6 +81,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index fc9790730..5887e85b1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -65,6 +65,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 8
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -81,6 +82,7 @@ backend:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
       tensor-parallel-size: 16
+      moe-dense-tp-size: 1
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From d34d894ef814cc5eb584d821c4bff1cd95d10a85 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 16:24:04 -0700
Subject: [PATCH 09/56] Set SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024
 in all env blocks

Belt-and-suspenders for the DeepEP per-rank dispatch buffer cap. The
default is too low; with this set we'll have headroom if EP / DeepEP
is re-enabled later (e.g., once the fork's mxfp4_deepseek dispatch API
mismatch is fixed). 1024 matches the cookbook's B200 decode reference.
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 7081919fc..4a6397649 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -50,6 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -59,6 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 6c7df35e4..cc67a2cb6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -79,6 +79,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -88,6 +89,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 9ddf19ee7..6a4258a8a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -51,6 +51,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,6 +61,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 4112e4244..8024a769f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -50,6 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -59,6 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index d9f43773f..4d997ec99 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -49,6 +49,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -58,6 +59,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 5887e85b1..ac26318aa 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -50,6 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -59,6 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:

From c24f25bf4772f81f4bf48529f51a8254b92c7069 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 16:42:07 -0700
Subject: [PATCH 10/56] =?UTF-8?q?Switch=20to=20TP=3D4=20single-node=20?=
 =?UTF-8?q?=E2=80=94=20match=20PR=20#75=20verbatim,=20fix=20FP8=20block-qu?=
 =?UTF-8?q?ant?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The run after adding moe-dense-tp-size: 1 still hit:
  ValueError: Weight output_partition_size = 192 is not divisible
              by weight quantization block_n = 128.

Verified in upstream sglang dp_attention.py (compute_dp_attention_local_info):
  if not enable_dp_attention:
      return tp_rank, tp_size, 0   # moe_dense_tp_size IGNORED
The flag is only honored when enable_dp_attention=True. Since we
already dropped DP-attention to avoid the fork's mxfp4_deepseek bug,
moe-dense-tp-size: 1 was a no-op.

Two valid paths:
  (a) re-enable DP-attention without DeepEP — speculative, never tested
  (b) drop to TP=4 — 1536/4=384 divides cleanly by 128, FP8 quant
      passes. Matches NVIDIA/srt-slurm PR #75 (the only verified-
      working DSV4 sglang disagg recipe upstream) verbatim.

Going with (b). Recipes drop moe-dense-tp-size (no longer needed at
TP=4) and switch tensor-parallel-size to 4 in both prefill+decode.
gpus_per_prefill / gpus_per_decode drop to 4 (single GB200 node per
worker). prefill_nodes / decode_nodes track worker counts.

Topology shape (filenames keep historical dep8/dep16 naming for
symmetry with the vLLM #1129 sibling; actual config is TP=4):
  - 1k1k 1p1d-tep8:    P TP=4 / D TP=4 (2 nodes total)
  - 1k1k 1p1d-dep16:   P TP=4 / D TP=4 (2 nodes total) — same shape, different conc
  - 1k1k 3p1d-dep16:   P 3*TP=4 / D TP=4 (4 nodes)
  - 8k1k 1p1d-tep8:    P TP=4 / D TP=4 (2 nodes)
  - 8k1k 3p1d-dep16:   P 3*TP=4 / D TP=4 (4 nodes)
  - 8k1k 7p1d-dep16:   P 7*TP=4 / D TP=4 (8 nodes)

nvidia-master.yaml updated to match (tp: 4, ep: 1, dp-attn: false on
every prefill+decode block — including the commented 8k/1k block).

Also bumped SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK 1024 → 2048
in all env blocks (DeepEP path is dormant in this config but the env
var is in place for re-enabling later).
---
 .github/configs/nvidia-master.yaml            | 94 ++++++++++---------
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    | 18 ++--
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 18 ++--
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 18 ++--
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 18 ++--
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 18 ++--
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 18 ++--
 7 files changed, 97 insertions(+), 105 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b2d361f65..edc142380 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7677,58 +7677,62 @@ dsv4-fp4-gb200-dynamo-sglang:
   multinode: true
   disagg: true
   seq-len-configs:
-  # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg
-  # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling
-  # so framework-level numbers are directly comparable. Per-worker
-  # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and
-  # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml
-  # (DSR1 sglang disagg structure).
+  # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75
+  # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang
+  # disagg recipe upstream). The lmsysorg/sglang:deepseek-v4-grace-
+  # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug
+  # (does not exist in upstream sgl-project/sglang) that crashes the
+  # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails
+  # FP8 block-quant divisibility (1536/8=192, not divisible by 128).
+  # TP=4 (1536/4=384) clears both — see recipe headers for the full chain.
+  # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with
+  # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4.
   - isl: 1024
     osl: 1024
     search-space:
-    # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
+    # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
     - conc-list: [1, 4, 8, 16, 32, 64]
       prefill:
         num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
+        tp: 4
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
       decode:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
-    # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes.
+    # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
     - conc-list: [128, 256, 1024, 2048, 4096]
       prefill:
         num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
+        tp: 4
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
-        tp: 16
-        ep: 16
-        dp-attn: true
-    # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
-    # 4096 overlap with the 1p1d block gives a topology-crossover A/B.
+        tp: 4
+        ep: 1
+        dp-attn: false
+    # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes.
+    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
-        tp: 8
-        ep: 8
-        dp-attn: true
+        tp: 4
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
-        tp: 16
-        ep: 16
-        dp-attn: true
+        tp: 4
+        ep: 1
+        dp-attn: false
 
   # 8k/1k block kept commented out — same rationale as the dsv4-fp4-
   # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded.
@@ -7736,45 +7740,45 @@ dsv4-fp4-gb200-dynamo-sglang:
   # - isl: 8192
   #   osl: 1024
   #   search-space:
-  #   # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
+  #   # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
   #   - conc-list: [1, 4, 8, 16, 32, 64]
   #     prefill:
   #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: true
+  #       tp: 4
+  #       ep: 1
+  #       dp-attn: false
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 8
+  #       tp: 4
   #       ep: 1
   #       dp-attn: false
-  #   # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
+  #   # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes.
   #   - conc-list: [512, 1024]
   #     prefill:
   #       num-worker: 3
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: true
+  #       tp: 4
+  #       ep: 1
+  #       dp-attn: false
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 16
-  #       ep: 16
-  #       dp-attn: true
-  #   # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes.
+  #       tp: 4
+  #       ep: 1
+  #       dp-attn: false
+  #   # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes.
   #   - conc-list: [4096, 8192]
   #     prefill:
   #       num-worker: 7
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: true
+  #       tp: 4
+  #       ep: 1
+  #       dp-attn: false
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 16
-  #       ep: 16
-  #       dp-attn: true
+  #       tp: 4
+  #       ep: 1
+  #       dp-attn: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 4a6397649..2833331d1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 4
+  prefill_nodes: 1
+  decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,14 +60,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -83,8 +82,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 16
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index cc67a2cb6..8b9603422 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -54,12 +54,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
+  prefill_nodes: 1
+  decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -79,7 +79,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -89,14 +89,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -112,8 +111,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 6a4258a8a..3115a0317 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -29,12 +29,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 6
-  decode_nodes: 4
+  prefill_nodes: 3
+  decode_nodes: 1
   prefill_workers: 3
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -51,7 +51,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -61,14 +61,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -84,8 +83,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 16
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 8024a769f..dd09ba086 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
+  prefill_nodes: 1
+  decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,14 +60,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -83,8 +82,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 4d997ec99..5a4bf4927 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -27,12 +27,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 6
-  decode_nodes: 4
+  prefill_nodes: 3
+  decode_nodes: 1
   prefill_workers: 3
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -49,7 +49,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -59,14 +59,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -82,8 +81,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 16
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index ac26318aa..b17d5e08f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 14
-  decode_nodes: 4
+  prefill_nodes: 7
+  decode_nodes: 1
   prefill_workers: 7
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,14 +60,13 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -83,8 +82,7 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 16
-      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From 8316d3f1bc21c831fbb1153ebdfc0fcb87b96b32 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 16:56:02 -0700
Subject: [PATCH 11/56] Restore mi355x retry changelog entries clobbered by
 merge

The merge of main into this branch (c0aec939) accidentally overwrote
the two dsv4-fp8-mi355x-sglang retry entries (PR #1148 retry-pair tail
and PR #1159 retry-pair) with duplicated copies of our own
dsv4-fp4-gb200-dynamo-sglang entry. The process_changelog.py gate
rejects deletions, so the workflow was blocked.

Restore the two mi355x entries verbatim from origin/main and keep a
single copy of our dsv4 entry, appended after the restored mi355x
block. perf-changelog.yaml diff vs origin/main is now additions-only.
---
 perf-changelog.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c0c907b88..5312db2fe 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1820,6 +1820,21 @@
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
 
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh"
+    - "Bump --chunked-prefill-size from 4096 to 8192"
+    - "Retrigger dsv4-fp8-mi355x-sglang"
+
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh"
+    - "Bump --chunked-prefill-size from 4096 to 8192"
+    - "Retrigger dsv4-fp8-mi355x-sglang"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1159
+
 - config-keys:
     - dsv4-fp4-gb200-dynamo-sglang
   description:

From f089567835284074bf161e40e7d1b75a373da5bf Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sat, 25 Apr 2026 17:36:53 -0700
Subject: [PATCH 12/56] Switch back to TP=8: enable-dp-attention +
 moe-dense-tp-size: 1, no moe-a2a-backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TP=4 OOMed — DSV4-Pro at MXFP4 doesn't fit on a single GB200 node.
Need TP=8 across 2 nodes (768 GB total).

But TP=8 trips two issues that earlier rounds papered over:
  a) shared-experts gate_up_proj FP8 block-quant divisibility
     (1536/8=192, not a multiple of block_n=128)
  b) the lmsysorg/sglang:deepseek-v4-grace-blackwell fork's
     mxfp4_deepseek kernel crashes on every DeepEP forward path

Single combo that solves both — verified in upstream sglang source:
  * enable-dp-attention: true  +  moe-dense-tp-size: 1
    Runs dense / shared-MLP layers replicated (TP=1) — fixes (a).
    moe-dense-tp-size IS gated on enable_dp_attention=True per
    python/sglang/srt/layers/dp_attention.py
    (compute_dp_attention_local_info ignores it when DP-attn is off).
  * NO moe-a2a-backend set (default 'none')
    Lands the model on forward_normal instead of forward_deepep —
    avoids (b). Verified in deepseek_v2.py:
      _enable_a2a_moe = is_deepep | is_mooncake | is_nixl | is_mori
                       | is_ascend_fuseep | is_flashinfer
    With backend='none' this is False and forward_normal runs.

Recipes: tensor-parallel-size 4 → 8 (both prefill+decode); add
moe-dense-tp-size: 1, enable-dp-attention: true, dp-size: 8 to every
sglang_config block; gpus_per_prefill / gpus_per_decode 4 → 8;
prefill_nodes / decode_nodes scale to workers × 2.

nvidia-master.yaml mirrors: tp 4 → 8, dp-attn false → true on every
prefill+decode block (active 1k/1k + commented 8k/1k). Topology shape
restored to:
  - 1k1k 1p1d-* : 4 nodes (was 2)
  - 1k1k 3p1d-* : 8 nodes (was 4)
  - 8k1k 1p1d-* : 4 nodes (commented)
  - 8k1k 3p1d-* : 8 nodes (commented)
  - 8k1k 7p1d-* : 16 nodes (commented)
---
 .github/configs/nvidia-master.yaml            | 86 ++++++++++---------
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    | 18 ++--
 .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 57 +++++++-----
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 18 ++--
 .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml     | 18 ++--
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 18 ++--
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 18 ++--
 7 files changed, 143 insertions(+), 90 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index edc142380..272f32702 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7677,62 +7677,68 @@ dsv4-fp4-gb200-dynamo-sglang:
   multinode: true
   disagg: true
   seq-len-configs:
-  # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75
-  # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang
-  # disagg recipe upstream). The lmsysorg/sglang:deepseek-v4-grace-
-  # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug
-  # (does not exist in upstream sgl-project/sglang) that crashes the
-  # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails
-  # FP8 block-quant divisibility (1536/8=192, not divisible by 128).
-  # TP=4 (1536/4=384) clears both — see recipe headers for the full chain.
-  # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with
-  # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4.
+  # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no
+  # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's
+  # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any
+  # DeepEP forward path (both DeepEPLLDispatchOutput and
+  # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel
+  # reads). At TP=8 the shared-experts gate_up_proj would also fail
+  # FP8 block-quant divisibility (1536/8=192, not divisible by 128)
+  # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated
+  # — and that flag is gated on `enable_dp_attention=True` in sglang
+  # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at
+  # its default `"none"` — sglang `forward_normal` path runs (verified
+  # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is
+  # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep
+  # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4-
+  # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with
+  # all-reduce/all-gather MoE dispatch.
   - isl: 1024
     osl: 1024
     search-space:
-    # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
+    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     - conc-list: [1, 4, 8, 16, 32, 64]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
       decode:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
-    # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
+        dp-attn: true
+    # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     - conc-list: [128, 256, 1024, 2048, 4096]
       prefill:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
-    # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes.
+        dp-attn: true
+    # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
     # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
 
   # 8k/1k block kept commented out — same rationale as the dsv4-fp4-
   # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded.
@@ -7740,45 +7746,45 @@ dsv4-fp4-gb200-dynamo-sglang:
   # - isl: 8192
   #   osl: 1024
   #   search-space:
-  #   # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes.
+  #   # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
   #   - conc-list: [1, 4, 8, 16, 32, 64]
   #     prefill:
   #       num-worker: 1
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
+  #       dp-attn: true
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
-  #   # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes.
+  #       dp-attn: true
+  #   # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
   #   - conc-list: [512, 1024]
   #     prefill:
   #       num-worker: 3
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
+  #       dp-attn: true
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
-  #   # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes.
+  #       dp-attn: true
+  #   # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes.
   #   - conc-list: [4096, 8192]
   #     prefill:
   #       num-worker: 7
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
+  #       dp-attn: true
   #       additional-settings:
   #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
   #     decode:
   #       num-worker: 1
-  #       tp: 4
+  #       tp: 8
   #       ep: 1
-  #       dp-attn: false
+  #       dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 2833331d1..36a70076d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -66,7 +66,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -82,7 +85,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 8b9603422..e4a530f2a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -10,21 +10,32 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
 # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
 # framework numbers stay directly comparable.
 #
-# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very
-# low concurrency (1-64).
+# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes.
+# Targets very low concurrency (1-64).
 #
-# Why TP-only (no DeepEP, no DP-attention, no EP): the
-# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships
-# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in
-# upstream sgl-project/sglang) that reads `dispatch_output.topk_output`
-# at line 347. Neither `DeepEPNormalDispatchOutput` nor
-# `DeepEPLLDispatchOutput` exposes that field in this fork, so any
-# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75
-# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only
-# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py)
-# and lets TP shard the expert weights instead of sharding via EP.
-# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch
-# API mismatch is fixed.
+# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"):
+#   1. DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM.
+#      TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits.
+#   2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork
+#      ships a fork-only quant kernel `mxfp4_deepseek.py` that reads
+#      `dispatch_output.topk_output`. Neither `DeepEPLLDispatchOutput`
+#      nor `DeepEPNormalDispatchOutput` exposes that field in this
+#      fork, so `forward_deepep` always crashes the prefill scheduler.
+#      We must stay off the DeepEP path.
+#   3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant
+#      divisibility (1536/8=192, not divisible by block_n=128).
+#      `moe-dense-tp-size: 1` runs the dense MLP layers replicated
+#      (TP=1) so the divisibility check passes — but that flag is
+#      gated on `enable_dp_attention=True` in sglang
+#      `python/sglang/srt/layers/dp_attention.py`
+#      (`compute_dp_attention_local_info` returns the full `tp_size`
+#      and ignores `moe_dense_tp_size` when DP-attn is off).
+# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so
+# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set.
+# The default `"none"` lands the MoE on `forward_normal` instead of
+# `forward_deepep` — verified in deepseek_v2.py:
+#   `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori|
+#    is_ascend_fuseep|is_flashinfer`  → False with default.
 
 model:
   path: "deepseek-v4-pro"
@@ -54,12 +65,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -95,7 +106,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -111,7 +125,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 3115a0317..b37023e88 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -29,12 +29,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 3
-  decode_nodes: 1
+  prefill_nodes: 6
+  decode_nodes: 2
   prefill_workers: 3
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -67,7 +67,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -83,7 +86,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index dd09ba086..2d202d337 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -66,7 +66,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -82,7 +85,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 5a4bf4927..a901098a4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -27,12 +27,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 3
-  decode_nodes: 1
+  prefill_nodes: 6
+  decode_nodes: 2
   prefill_workers: 3
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -65,7 +65,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -81,7 +84,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index b17d5e08f..f17bd7e2f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -28,12 +28,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 7
-  decode_nodes: 1
+  prefill_nodes: 14
+  decode_nodes: 2
   prefill_workers: 7
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -66,7 +66,10 @@ backend:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -82,7 +85,10 @@ backend:
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
       trust-remote-code: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
+      moe-dense-tp-size: 1
+      enable-dp-attention: true
+      dp-size: 8
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From 5b6eb2f36274103891cad70218c3af0940fc747b Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 09:19:45 -0700
Subject: [PATCH 13/56] =?UTF-8?q?Scope=20sweep=20to=20high-conc=20DeepEP?=
 =?UTF-8?q?=20only=20=E2=80=94=20temporarily=20comment=201p1d=20blocks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Comment out the low-conc (1-64) and mid-conc (128-4096) search-space
entries in nvidia-master.yaml so the sweep runs only the high-conc
3p1d-dep8-dep16 topology. Re-enable DeepEP on that one recipe to
exercise the EP path:

  3p1d-dep8-dep16 prefill+decode:
    + ep-size: 8
    + moe-a2a-backend: "deepep"
    + deepep-mode: low_latency
    (kept enable-dp-attention + moe-dense-tp-size: 1 + tp=8 / dp=8)

Master matrix label updated to ep=8 to reflect the recipe.

Sibling 1p1d recipes on disk are unchanged (still TP=8 + DP-attn,
no DeepEP). The commented-out master entries still point at them;
uncomment those entries to bring the 1p1d runs back into the sweep.
---
 .github/configs/nvidia-master.yaml            | 68 ++++++++++---------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 27 ++++++--
 2 files changed, 56 insertions(+), 39 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 272f32702..87a810072 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7696,48 +7696,52 @@ dsv4-fp4-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    - conc-list: [1, 4, 8, 16, 32, 64]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    - conc-list: [128, 256, 1024, 2048, 4096]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
+    # Low-/mid-conc blocks temporarily commented out so the sweep
+    # exercises only the high-conc DeepEP topology below — uncomment
+    # to re-enable.
+    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    # - conc-list: [1, 4, 8, 16, 32, 64]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    # - conc-list: [128, 256, 1024, 2048, 4096]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8)
+    # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's
+    # ep-size: 8 + moe-a2a-backend: deepep.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
 
   # 8k/1k block kept commented out — same rationale as the dsv4-fp4-
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index b37023e88..be872d48f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,13 +1,20 @@
 name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
 
-# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
-# upstream-reference list. Topology mirrors the dsv4-fp4-gb200-dynamo-
-# vllm sibling.
+# High-concurrency 4096/8192 topology — the only block left active in
+# nvidia-master.yaml right now while we iterate on the DeepEP path.
+# Sibling 1p1d recipes are kept on disk but their master.yaml entries
+# are temporarily commented out.
 #
-# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes.
-# Sized for conc 4096-8192 — at those concurrencies a single prefill
-# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the
-# 1k prefill arrival rate exceeds what one DP=8 worker can sustain.
+# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8,
+# deepep-mode: low_latency). The two 1p1d siblings stay on the
+# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek-
+# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see
+# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to
+# either crash in the same way or surface new behaviour — the goal is
+# to capture the failure mode under the actual disagg/EP topology.
+#
+# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 /
+# EP=8). 8 nodes. Sized for conc 4096-8192.
 
 model:
   path: "deepseek-v4-pro"
@@ -71,6 +78,9 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
+      ep-size: 8
+      moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -90,6 +100,9 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
+      ep-size: 8
+      moe-a2a-backend: "deepep"
+      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From b9135868d783e67c841edcff8cb64e05d5326615 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 09:22:43 -0700
Subject: [PATCH 14/56] tep fix + dep for high conc

---
 .github/configs/nvidia-master.yaml | 32 ++++++++++++++----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 87a810072..c886172ea 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7696,23 +7696,21 @@ dsv4-fp4-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # Low-/mid-conc blocks temporarily commented out so the sweep
-    # exercises only the high-conc DeepEP topology below — uncomment
-    # to re-enable.
-    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    # - conc-list: [1, 4, 8, 16, 32, 64]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
+    # Mid-conc block temporarily commented out — uncomment to re-enable.
+    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    - conc-list: [1, 4, 8, 16, 32, 64]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
     # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     # - conc-list: [128, 256, 1024, 2048, 4096]
     #   prefill:

From bca99eb5b539e68e36b2ed4038fc9bd9a4826190 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 09:45:19 -0700
Subject: [PATCH 15/56] sike no dpa

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c886172ea..1650385a2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7710,7 +7710,7 @@ dsv4-fp4-gb200-dynamo-sglang:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
     # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     # - conc-list: [128, 256, 1024, 2048, 4096]
     #   prefill:

From 5866658855a762dc2da9317c74e5c8f5034c676a Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 10:13:15 -0700
Subject: [PATCH 16/56] =?UTF-8?q?Cap=20SGLANG=5FDEEPEP=5FNUM=5FMAX=5FDISPA?=
 =?UTF-8?q?TCH=5FTOKENS=5FPER=5FRANK=20at=201024=20=E2=80=94=20sglang=20LL?=
 =?UTF-8?q?=20hard=20ceiling?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepEP run (3p1d-dep8-dep16) crashed at:

  File '.../sglang/srt/layers/moe/token_dispatcher/deepep.py', line 325
    assert self.num_max_dispatch_tokens_per_rank <= 1024
  AssertionError

_DeepEPDispatcherImplLowLatency enforces a hard upper bound of 1024 in
low_latency mode. We had bumped the env var to 2048 to give headroom
above the earlier C++ side cap (deep_ep.cpp:1233 'x.size(0) <=
num_max_dispatch_tokens_per_rank'), but 2048 trips this Python-side
assertion at scheduler init. 1024 is the largest value the assertion
allows: high enough to cover the cuda-graph-max-bs we use, low enough
to satisfy the LL dispatcher constructor.

Apply 2048 → 1024 across all 6 recipes (every prefill + decode env
block).
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 4 ++--
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml  | 4 ++--
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++--
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml  | 4 ++--
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++--
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 36a70076d..9b773b346 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,7 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index e4a530f2a..c8bcc16a1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -90,7 +90,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -100,7 +100,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index be872d48f..a84417a16 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -58,7 +58,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -68,7 +68,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 2d202d337..267e69dd5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,7 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index a901098a4..0bbf14313 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -49,7 +49,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -59,7 +59,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index f17bd7e2f..436c3b4aa 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -50,7 +50,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   decode_environment:
     PYTHONUNBUFFERED: "1"
@@ -60,7 +60,7 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
 
   sglang_config:
     prefill:

From c0fc3bbe0d2908940343fbd6e7676359c9e51966 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 11:52:47 -0700
Subject: [PATCH 17/56] Revert 3p1d-dep8-dep16 to no-DeepEP TP-only; uncomment
 full 1k/1k + 8k/1k sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepEP is broken on the lmsysorg/sglang:deepseek-v4-grace-blackwell
image — verified across three runs (deepep-mode auto/normal,
deepep-mode low_latency, and the latest 3p1d try). All hit the
fork-only mxfp4_deepseek.py:347 reading dispatch_output.topk_output,
which neither DeepEPLLDispatchOutput nor DeepEPNormalDispatchOutput
exposes in this fork. Cannot be fixed from the recipe — needs the
image rebuilt with mxfp4_deepseek patched, or an upstream sglang fix.

3p1d-dep8-dep16 recipe: drop ep-size, moe-a2a-backend, deepep-mode
from prefill+decode. Now matches the 1p1d siblings: TP=8 + DP=8 +
moe-dense-tp-size: 1, default 'none' a2a backend (forward_normal
path bypasses the buggy mxfp4_deepseek kernel).

nvidia-master.yaml:
  * Uncomment the 1k/1k mid-conc and 8k/1k blocks (low + mid + high).
  * 3p1d-dep8-dep16 matrix label ep: 8 → ep: 1 to match recipe.

Sweep now expands to 6 entries / 23 conc points (3 1k/1k entries =
6 + 5 + 2 = 13 points; 3 8k/1k entries = 6 + 2 + 2 = 10 points).
---
 .github/configs/nvidia-master.yaml            | 131 +++++++++---------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    |  29 ++--
 2 files changed, 72 insertions(+), 88 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6123d7e6e..30491567f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7720,7 +7720,6 @@ dsv4-fp4-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # Mid-conc block temporarily commented out — uncomment to re-enable.
     # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     - conc-list: [1, 4, 8, 16, 32, 64]
       prefill:
@@ -7735,82 +7734,78 @@ dsv4-fp4-gb200-dynamo-sglang:
         tp: 8
         ep: 1
         dp-attn: true
-    # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    # - conc-list: [128, 256, 1024, 2048, 4096]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8)
-    # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's
-    # ep-size: 8 + moe-a2a-backend: deepep.
+    # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    - conc-list: [128, 256, 1024, 2048, 4096]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+    # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
+    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: true
 
-  # 8k/1k block kept commented out — same rationale as the dsv4-fp4-
-  # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded.
-  # Uncomment to re-enable (recipes are already in place).
-  # - isl: 8192
-  #   osl: 1024
-  #   search-space:
-  #   # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-  #   - conc-list: [1, 4, 8, 16, 32, 64]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
-  #   # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-  #   - conc-list: [512, 1024]
-  #     prefill:
-  #       num-worker: 3
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
-  #   # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes.
-  #   - conc-list: [4096, 8192]
-  #     prefill:
-  #       num-worker: 7
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 1
-  #       dp-attn: true
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    - conc-list: [1, 4, 8, 16, 32, 64]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+    # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
+    - conc-list: [512, 1024]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
+    # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes.
+    - conc-list: [4096, 8192]
+      prefill:
+        num-worker: 7
+        tp: 8
+        ep: 1
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index a84417a16..0548de9ff 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,20 +1,15 @@
 name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
 
-# High-concurrency 4096/8192 topology — the only block left active in
-# nvidia-master.yaml right now while we iterate on the DeepEP path.
-# Sibling 1p1d recipes are kept on disk but their master.yaml entries
-# are temporarily commented out.
+# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no
+# DeepEP shape as the 1p1d siblings — see
+# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint
+# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need
+# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none`
+# moe-a2a-backend → forward_normal path bypasses the buggy kernel).
+# Adds prefill capacity (3 workers vs 1) for the high-conc tail —
+# single prefill saturates around conc 4096 at 1k prompts.
 #
-# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8,
-# deepep-mode: low_latency). The two 1p1d siblings stay on the
-# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek-
-# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see
-# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to
-# either crash in the same way or surface new behaviour — the goal is
-# to capture the failure mode under the actual disagg/EP topology.
-#
-# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 /
-# EP=8). 8 nodes. Sized for conc 4096-8192.
+# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes.
 
 model:
   path: "deepseek-v4-pro"
@@ -78,9 +73,6 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
-      ep-size: 8
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -100,9 +92,6 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
-      ep-size: 8
-      moe-a2a-backend: "deepep"
-      deepep-mode: low_latency
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From bc9fccf49bdaaf4c75f028ae7b58e772c618e079 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Sun, 26 Apr 2026 21:57:32 -0700
Subject: [PATCH 18/56] Try moe-a2a-backend: flashinfer on 3p1d-dep8-dep16 for
 high-conc EP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepEP is dead in this image (mxfp4_deepseek.py:347 reads
dispatch_output.topk_output, but neither the DeepEPNormal nor the
DeepEPLL dispatch output exposes that field). Smoke-test the only other
plausible EP backend upstream sglang offers: flashinfer.

Per upstream docs/advanced_features/expert_parallelism.md, flashinfer
is the documented option for 'Large-scale EP deployments' and uses a
different dispatcher than DeepEP — its output class may or may not
trip the same mxfp4_deepseek bug. Per server_args.py _handle_a2a_moe,
flashinfer auto-sets SGLANG_MOE_NVFP4_DISPATCH=True and forces
ep_size = tp_size, so we set ep-size: 8 explicitly. Everything else
(TP=8 / DP=8 / moe-dense-tp-size: 1) stays so the FP8 block-quant
path remains valid.

Scope: 1k/1k 3p1d-dep8-dep16 only. If the EP path serves on this
image, port back to the 1p1d siblings; if it crashes the same way
DeepEP did, revert to the no-EP forward_normal path and accept the
TP-only pareto.

nvidia-master.yaml matrix labels for the 3p1d entry updated to ep=8
to match the recipe.
---
 .github/configs/nvidia-master.yaml            |  9 +++--
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 36 ++++++++++++++-----
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 21ed11dd1..6123bdf6d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7748,19 +7748,22 @@ dsv4-fp4-gb200-dynamo-sglang:
         ep: 1
         dp-attn: true
     # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
+    # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via
+    # flashinfer EP smoke test (DeepEP is dead in this image — see the
+    # recipe header). matrix labels ep=8 reflect the recipe's
+    # ep-size: 8 + moe-a2a-backend: flashinfer.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
 
   - isl: 8192
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 0548de9ff..e86224bca 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,15 +1,29 @@
 name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
 
-# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no
-# DeepEP shape as the 1p1d siblings — see
-# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint
-# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need
-# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none`
-# moe-a2a-backend → forward_normal path bypasses the buggy kernel).
-# Adds prefill capacity (3 workers vs 1) for the high-conc tail —
-# single prefill saturates around conc 4096 at 1k prompts.
+# High-concurrency 4096/8192 topology — flashinfer EP smoke test.
 #
-# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes.
+# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads
+# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output
+# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml
+# header). This recipe tries `moe-a2a-backend: flashinfer` instead —
+# upstream sglang docs (docs/advanced_features/expert_parallelism.md)
+# call out flashinfer as the option for "Large-scale EP deployments",
+# and its dispatcher returns a different output class than DeepEP, so
+# the mxfp4_deepseek apply path may or may not trip the same bug.
+#
+# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets
+# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we
+# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 /
+# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid.
+#
+# Goal here is binary: does the EP path serve any real prefill batch
+# on this image, or does it crash the same way DeepEP did. If it
+# serves, copy this pattern back to the 1p1d siblings; if it crashes,
+# revert to the no-EP forward_normal path and accept the TP-only
+# pareto.
+#
+# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 /
+# EP=8). 8 nodes.
 
 model:
   path: "deepseek-v4-pro"
@@ -73,6 +87,8 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
+      ep-size: 8
+      moe-a2a-backend: "flashinfer"
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -92,6 +108,8 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
+      ep-size: 8
+      moe-a2a-backend: "flashinfer"
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From e6d8943c7f883904a4ea8bca774db51e6dd572cb Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 00:09:34 -0700
Subject: [PATCH 19/56] =?UTF-8?q?Revert=20flashinfer=20EP=20attempt=20?=
 =?UTF-8?q?=E2=80=94=20accept=20TP-only=20pareto,=20every=20EP=20backend?=
 =?UTF-8?q?=20dead=20on=20this=20image?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

flashinfer EP smoke test (3p1d-dep8-dep16 1k/1k) crashed at startup:

  File '.../sglang/srt/server_args.py', line 2133, in _handle_a2a_moe
    assert self.moe_runner_backend in [...]
  AssertionError: Flashinfer MoE A2A is only supported with
                  flashinfer_cutlass moe runner backend

flashinfer_cutlass is FP8-only — won't load DSV4-Pro's MXFP4 weights.
The only path that satisfies the assertion would also fail at model
load. So flashinfer is unusable for DSV4 on any image that doesn't
ship a flashinfer_mxfp4_cutlass runner (which doesn't exist).

Combined with the earlier deepep failure (mxfp4_deepseek.py:347
AttributeError on dispatch_output.topk_output, both Normal and LL
dispatch types), every EP backend sglang exposes in this image is
dead. Remaining options (mooncake, nixl-ep, mori, ascend_fuseep) are
either Ascend-NPU-only or not wired into this image.

Revert 3p1d-dep8-dep16 recipe to no-EP TP-only (matches the 5 sibling
recipes) and master.yaml matrix labels (ep: 8 → ep: 1).

The PR description's Known Issues section is updated to a 4-row table
covering every EP backend that was tried and accepted as a dead end.
---
 .github/configs/nvidia-master.yaml            |  9 ++--
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 41 +++++++------------
 2 files changed, 18 insertions(+), 32 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6123bdf6d..21ed11dd1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7748,22 +7748,19 @@ dsv4-fp4-gb200-dynamo-sglang:
         ep: 1
         dp-attn: true
     # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via
-    # flashinfer EP smoke test (DeepEP is dead in this image — see the
-    # recipe header). matrix labels ep=8 reflect the recipe's
-    # ep-size: 8 + moe-a2a-backend: flashinfer.
+    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: true
 
   - isl: 8192
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index e86224bca..96acb25f2 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,29 +1,22 @@
 name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
 
-# High-concurrency 4096/8192 topology — flashinfer EP smoke test.
+# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP
+# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml
+# header for the full constraint chain.
 #
-# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads
-# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output
-# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml
-# header). This recipe tries `moe-a2a-backend: flashinfer` instead —
-# upstream sglang docs (docs/advanced_features/expert_parallelism.md)
-# call out flashinfer as the option for "Large-scale EP deployments",
-# and its dispatcher returns a different output class than DeepEP, so
-# the mxfp4_deepseek apply path may or may not trip the same bug.
+# Both EP backends available upstream (deepep, flashinfer) are dead on
+# this image:
+#   * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output;
+#     neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput
+#     exposes that field in this fork.
+#   * flashinfer — `_handle_a2a_moe` in server_args.py asserts
+#     "Flashinfer MoE A2A is only supported with flashinfer_cutlass
+#     moe runner backend", and flashinfer_cutlass is FP8-only — won't
+#     load DSV4-Pro's MXFP4 weights.
+# Adds prefill capacity (3 workers vs 1) for the high-conc tail —
+# single prefill saturates around conc 4096 at 1k prompts.
 #
-# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets
-# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we
-# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 /
-# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid.
-#
-# Goal here is binary: does the EP path serve any real prefill batch
-# on this image, or does it crash the same way DeepEP did. If it
-# serves, copy this pattern back to the 1p1d siblings; if it crashes,
-# revert to the no-EP forward_normal path and accept the TP-only
-# pareto.
-#
-# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 /
-# EP=8). 8 nodes.
+# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes.
 
 model:
   path: "deepseek-v4-pro"
@@ -87,8 +80,6 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
-      ep-size: 8
-      moe-a2a-backend: "flashinfer"
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true
@@ -108,8 +99,6 @@ backend:
       moe-dense-tp-size: 1
       enable-dp-attention: true
       dp-size: 8
-      ep-size: 8
-      moe-a2a-backend: "flashinfer"
       moe-runner-backend: "flashinfer_mxfp4"
       chunked-prefill-size: 4096
       disable-flashinfer-autotune: true

From 1d27533a322c3016f27d7ddf305a0380accefd6b Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 14:46:45 -0700
Subject: [PATCH 20/56] fix(sglang): bump 8k1k prefill max-running-requests
 from 4 to 8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sglang computes per-rank capacity as max_running_requests // dp_size.
With dp-size=8, a value of 4 floors to 0, hitting the
"max_running_request is zero" assertion in tp_worker.py:277.
Bump to 8 so each DP rank gets at least 1 slot — matches the
working 1p1d recipe.
---
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 +-
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 0bbf14313..291390321 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -75,7 +75,7 @@ backend:
       disable-radix-cache: true
       mem-fraction-static: 0.82
       context-length: 9280
-      max-running-requests: 4
+      max-running-requests: 8
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 436c3b4aa..e635de8f0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -76,7 +76,7 @@ backend:
       disable-radix-cache: true
       mem-fraction-static: 0.82
       context-length: 9280
-      max-running-requests: 4
+      max-running-requests: 8
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"

From df1c783af91d2a2cfe4cbd74e839cc609ce37a4b Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Mon, 27 Apr 2026 21:05:54 -0700
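Subject: [PATCH 21/56] Set disaggregation-bootstrap-port to 30001 in all
 DSV4 sglang recipes

Add disaggregation-bootstrap-port: 30001 to the prefill and decode
sglang_config sections of all six DeepSeek-V4 GB200 disagg recipes.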
---
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml    | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml   | 2 ++
 .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml   | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 9b773b346..d309562a1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -80,6 +80,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -100,6 +101,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index c8bcc16a1..e20c9c0a2 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -120,6 +120,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -140,6 +141,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 96acb25f2..a8a161798 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -90,6 +90,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -110,6 +111,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
index 267e69dd5..218ad01f6 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -80,6 +80,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -100,6 +101,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 291390321..a1fd14571 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -79,6 +79,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -99,6 +100,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index e635de8f0..4eb0f2716 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -80,6 +80,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "prefill"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -100,6 +101,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       disaggregation-mode: "decode"
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:

From 513cbef2d45f095994e4e32a7322fcd919ecb7da Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Tue, 28 Apr 2026 12:03:07 -0700
Subject: [PATCH 22/56] Dsv4 fp4 gb200 dynamo sglang disagg (#1213)

* Modify deepseek-v4 configuration for new model settings

* Update YAML configuration for deepseek model
---
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 246 ++++++++++++------
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 234 ++++++++++++-----
 2 files changed, 325 insertions(+), 155 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index a8a161798..6dddf8204 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,123 +1,203 @@
-name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
-
-# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP
-# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml
-# header for the full constraint chain.
-#
-# Both EP backends available upstream (deepep, flashinfer) are dead on
-# this image:
-#   * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output;
-#     neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput
-#     exposes that field in this fork.
-#   * flashinfer — `_handle_a2a_moe` in server_args.py asserts
-#     "Flashinfer MoE A2A is only supported with flashinfer_cutlass
-#     moe runner backend", and flashinfer_cutlass is FP8-only — won't
-#     load DSV4-Pro's MXFP4 weights.
-# Adds prefill capacity (3 workers vs 1) for the high-conc tail —
-# single prefill saturates around conc 4096 at 1k prompts.
-#
-# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes.
+name: "dsv4-pro-gb300-fp4"
 
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
+slurm:
+  partition: hpc-mid
+  time_limit: "03:00:00"
+
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
 
-# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
 
-slurm:
-  time_limit: "8:00:00"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+  nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh
 
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
 
 resources:
-  gpu_type: "gb200"
+  gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 6
-  decode_nodes: 2
-  prefill_workers: 3
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  # prefill_nodes / prefill_workers / decode_nodes / decode_workers are
+  # set per-override; not duplicated in base.
 
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
+extra_mount:
+  - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang"
+  - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang"
+
+# setup_script: "install_sglang.sh"
 
 backend:
   type: sglang
 
   prefill_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
 
   decode_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+
+      # Parallel
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
       enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 16
-      stream-interval: 50
-      decode-log-interval: 1000
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
       disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+      # disable-radix-cache: true # NOTE try to enable radix cache
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 1024
-      cuda-graph-max-bs: 1024
-      stream-interval: 50
-      decode-log-interval: 1000
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+      # disable-radix-cache: true # NOTE try to enable radix cache
+
       disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+      disaggregation-transfer-backend: mooncake
+
+      # tensor-parallel-size / data-parallel-size / expert-parallel-size
+      # / max-running-requests / cuda-graph-max-bs are set per-override.
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+
+  benchmark:
+    type: custom
+    command: |
+      set -e
+      REPO=/configs/upstream-sa-bench/InferenceX
+      [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO"
+      cd "$REPO/utils/bench_serving"
+      python3 benchmark_serving.py \
+        --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
+        --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
+        --dataset-name random \
+        --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \
+        --random-num-workers 96 \
+        --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
+        --num-warmups 512 \
+        --ignore-eos --trust-remote-code \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --save-result --result-dir /logs --result-filename results.json
+    # concurrencies set per-override
+
+############ 1k1k ##############
+# [0]is wideep, [1] is narrow ep
+zip_override_1k1k_hightpt:
+  resources:
+    prefill_nodes:   [7, 1]
+    prefill_workers: [7, 1]
+    decode_nodes:    [2, 2]
+    decode_workers:  [1, 1]
+  backend:
+    sglang_config:
+      decode:
+        tensor-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+        data-parallel-size:       [8, 8] # NOTE change from 16gpu to 8gpu
+        expert-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+
+        enable-dp-attention: true
+        enable-dp-lm-head: true
+
+        # ep-num-redundant-experts + ep-dispatch-algorithm intentionally
+        # removed: no static dispatching file available yet.
+
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu
+        cuda-graph-max-bs:    [1152,  32]
+
+        # benchmark:
+        # isl: 1024
+        # osl: 1024
+        # concurrencies: "16384"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index 4eb0f2716..dacb0f9bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -1,113 +1,203 @@
-name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+name: "dsv4-pro-gb300-fp4"
 
-# 8k/1k max-throughput topology: 7 prefill (DP=8 EP=8) + 1 wide decode
-# (DP=16 EP=16). 18 nodes — full GB200 cluster. Targets conc 4096-8192.
-# Per-worker tunings identical to the 3p1d sibling; only prefill_workers
-# and prefill_nodes scale up.
-#
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference
-# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
+slurm:
+  partition: hpc-mid
+  time_limit: "03:00:00"
 
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
 
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
 dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
 
-slurm:
-  time_limit: "8:00:00"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+  nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh
 
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
 
 resources:
-  gpu_type: "gb200"
+  gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 14
-  decode_nodes: 2
-  prefill_workers: 7
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  # prefill_nodes / prefill_workers / decode_nodes / decode_workers are
+  # set per-override; not duplicated in base.
 
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
+extra_mount:
+  - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang"
+  - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang"
+
+# setup_script: "install_sglang.sh"
 
 backend:
   type: sglang
 
   prefill_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
 
   decode_environment:
+    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
 
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+
+      # Parallel
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
       enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 8
-      stream-interval: 50
-      decode-log-interval: 1000
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
       disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+      # disable-radix-cache: true # NOTE try to enable radix cache
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
       trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 256
-      cuda-graph-max-bs: 256
-      stream-interval: 50
-      decode-log-interval: 1000
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30 # pr50 sets it, let's do it
+      # tokenizer-worker-num: 16  # need this if we run tokenizer
+      # disable-radix-cache: true # NOTE try to enable radix cache
+
       disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
+      disaggregation-transfer-backend: mooncake
+
+      # tensor-parallel-size / data-parallel-size / expert-parallel-size
+      # / max-running-requests / cuda-graph-max-bs are set per-override.
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+
+  benchmark:
+    type: custom
+    command: |
+      set -e
+      REPO=/configs/upstream-sa-bench/InferenceX
+      [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO"
+      cd "$REPO/utils/bench_serving"
+      python3 benchmark_serving.py \
+        --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
+        --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
+        --dataset-name random \
+        --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \
+        --random-num-workers 96 \
+        --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
+        --num-warmups 512 \
+        --ignore-eos --trust-remote-code \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --save-result --result-dir /logs --result-filename results.json
+    # concurrencies set per-override
+
+############ 8k1k ##############
+# [0]is wideep, [1] is narrow ep
+zip_override_8k1k_hightpt:
+  resources:
+    prefill_nodes:   [7, 1]
+    prefill_workers: [7, 1]
+    decode_nodes:    [2, 2]
+    decode_workers:  [1, 1]
+  backend:
+    sglang_config:
+      decode:
+        tensor-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+        data-parallel-size:       [8, 8] # NOTE change from 16gpu to 8gpu
+        expert-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
+
+        enable-dp-attention: true
+        enable-dp-lm-head: true
+
+        # ep-num-redundant-experts + ep-dispatch-algorithm intentionally
+        # removed: no static dispatching file available yet.
+
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu
+        cuda-graph-max-bs:    [1152,  32]
+
+        # benchmark:
+        # isl: 8192
+        # osl: 1024
+        # concurrencies: "16384"

From b27c8da37878535c5a1d9e092be8140d360885b5 Mon Sep 17 00:00:00 2001
From: Oseltamivir <bryansg2013@gmail.com>
Date: Tue, 28 Apr 2026 12:36:51 -0700
Subject: [PATCH 23/56] adapt for model path, etc

---
 .github/configs/nvidia-master.yaml            | 127 +++++++-------
 .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml    | 162 ++++++++---------
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 163 ++++++++----------
 3 files changed, 207 insertions(+), 245 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0b43c4549..1c85aeab2 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7746,36 +7746,40 @@ dsv4-fp4-gb200-dynamo-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    - conc-list: [1, 4, 8, 16, 32, 64]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    - conc-list: [128, 256, 1024, 2048, 4096]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # 4096 overlap with the 1p1d block gives a prefill-scaling A/B.
+    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 1p1d-dep8-dep16
+    # recipes) commented out: PR #1213 only refreshed the 3p1d-dep8-dep16
+    # high-throughput recipe; the 1p1d siblings still match the older
+    # operational shape and are out of scope for the PR #1213 sweep.
+    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    # - conc-list: [1, 4, 8, 16, 32, 64]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    # - conc-list: [128, 256, 1024, 2048, 4096]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # High throughput: 3 prefills (TP=4 / DP=4 / EP=4) + 1 decode
+    # (TP=8 / DP=8 / EP=8 wideep). 5 nodes. Refreshed by PR #1213.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 3
@@ -7793,35 +7797,40 @@ dsv4-fp4-gb200-dynamo-sglang:
   - isl: 8192
     osl: 1024
     search-space:
-    # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    - conc-list: [1, 4, 8, 16, 32, 64]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    - conc-list: [512, 1024]
-      prefill:
-        num-worker: 3
-        tp: 8
-        ep: 1
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: true
-    # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes.
+    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16
+    # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16
+    # max-throughput recipe; the 1p1d/3p1d siblings still match the older
+    # operational shape and are out of scope for the PR #1213 sweep.
+    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
+    # - conc-list: [1, 4, 8, 16, 32, 64]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
+    # - conc-list: [512, 1024]
+    #   prefill:
+    #     num-worker: 3
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 1
+    #     dp-attn: true
+    # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode
+    # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213.
     - conc-list: [4096, 8192]
       prefill:
         num-worker: 7
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
index 6dddf8204..ced4e1e5b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -1,45 +1,63 @@
-name: "dsv4-pro-gb300-fp4"
+name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
+
+# 1k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
+# (matrix runs on gb200-nv runners, not gb300), container & model.path
+# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
+# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
+# slurm.partition + sbatch_directives + extra_mount + nginx_container
+# dropped (they reference paths/partitions that exist only on the PR
+# author's gb300 cluster).
 
-slurm:
-  partition: hpc-mid
-  time_limit: "03:00:00"
-
-sbatch_directives:
-  cpus-per-task: "144"
-  mem: "0"
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
 
+# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
+# Hash bumped from PR #1213 to track the dynamo-sglang dsv4 dev branch.
 dynamo:
   hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
 
-frontend:
-  type: dynamo
-  enable_multiple_frontends: true
-  num_additional_frontends: 8
-  nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh
+slurm:
+  time_limit: "8:00:00"
 
-model:
-  path: "dsv4-pro"
-  container: "dsv4-grace-blackwell"
-  precision: "fp4"
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
 
+# Topology: 3 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 5 nodes total.
 resources:
-  gpu_type: "gb300"
+  gpu_type: "gb200"
   gpus_per_node: 4
-  # prefill_nodes / prefill_workers / decode_nodes / decode_workers are
-  # set per-override; not duplicated in base.
-
-extra_mount:
-  - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang"
-  - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang"
+  prefill_nodes: 3
+  decode_nodes: 2
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
 
-# setup_script: "install_sglang.sh"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
 
 backend:
   type: sglang
 
   prefill_environment:
-    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
     SGLANG_ENABLE_THINKING: "1"
@@ -70,8 +88,7 @@ backend:
     SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
 
   decode_environment:
-    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
     SGLANG_ENABLE_THINKING: "1"
@@ -105,14 +122,11 @@ backend:
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30 # pr50 sets it, let's do it
-      # tokenizer-worker-num: 16  # need this if we run tokenizer
+      stream-interval: 30
 
-      # Parallel
       tensor-parallel-size: 4
       data-parallel-size: 4
       expert-parallel-size: 4
@@ -128,76 +142,38 @@ backend:
       max-running-requests: 512
       cuda-graph-max-bs: 512
       chunked-prefill-size: 32768
-      # disable-radix-cache: true # NOTE try to enable radix cache
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30 # pr50 sets it, let's do it
-      # tokenizer-worker-num: 16  # need this if we run tokenizer
-      # disable-radix-cache: true # NOTE try to enable radix cache
+      stream-interval: 30
+
+      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
       disaggregation-mode: "decode"
       disaggregation-transfer-backend: mooncake
 
-      # tensor-parallel-size / data-parallel-size / expert-parallel-size
-      # / max-running-requests / cuda-graph-max-bs are set per-override.
-
       mem-fraction-static: 0.94
       swa-full-tokens-ratio: 0.15
       context-length: 16384
-
-  benchmark:
-    type: custom
-    command: |
-      set -e
-      REPO=/configs/upstream-sa-bench/InferenceX
-      [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO"
-      cd "$REPO/utils/bench_serving"
-      python3 benchmark_serving.py \
-        --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
-        --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
-        --dataset-name random \
-        --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \
-        --random-num-workers 96 \
-        --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
-        --num-warmups 512 \
-        --ignore-eos --trust-remote-code \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --save-result --result-dir /logs --result-filename results.json
-    # concurrencies set per-override
-
-############ 1k1k ##############
-# [0]is wideep, [1] is narrow ep
-zip_override_1k1k_hightpt:
-  resources:
-    prefill_nodes:   [7, 1]
-    prefill_workers: [7, 1]
-    decode_nodes:    [2, 2]
-    decode_workers:  [1, 1]
-  backend:
-    sglang_config:
-      decode:
-        tensor-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
-        data-parallel-size:       [8, 8] # NOTE change from 16gpu to 8gpu
-        expert-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
-
-        enable-dp-attention: true
-        enable-dp-lm-head: true
-
-        # ep-num-redundant-experts + ep-dispatch-algorithm intentionally
-        # removed: no static dispatching file available yet.
-
-        moe-a2a-backend: "deepep"
-        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-        max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu
-        cuda-graph-max-bs:    [1152,  32]
-
-        # benchmark:
-        # isl: 1024
-        # osl: 1024
-        # concurrencies: "16384"
+      max-running-requests: 9216
+      cuda-graph-max-bs: 1152
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
index dacb0f9bd..3a72d70f8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -1,45 +1,64 @@
-name: "dsv4-pro-gb300-fp4"
+name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
+# (matrix runs on gb200-nv runners, not gb300), container & model.path
+# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
+# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
+# slurm.partition + sbatch_directives + extra_mount + nginx_container
+# dropped (they reference paths/partitions that exist only on the PR
+# author's gb300 cluster).
 
-slurm:
-  partition: hpc-mid
-  time_limit: "03:00:00"
-
-sbatch_directives:
-  cpus-per-task: "144"
-  mem: "0"
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
 
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
 dynamo:
   hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
 
-frontend:
-  type: dynamo
-  enable_multiple_frontends: true
-  num_additional_frontends: 8
-  nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh
+slurm:
+  time_limit: "8:00:00"
 
-model:
-  path: "dsv4-pro"
-  container: "dsv4-grace-blackwell"
-  precision: "fp4"
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
 
+# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
 resources:
-  gpu_type: "gb300"
+  gpu_type: "gb200"
   gpus_per_node: 4
-  # prefill_nodes / prefill_workers / decode_nodes / decode_workers are
-  # set per-override; not duplicated in base.
-
-extra_mount:
-  - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang"
-  - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang"
+  prefill_nodes: 7
+  decode_nodes: 2
+  prefill_workers: 7
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
 
-# setup_script: "install_sglang.sh"
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
 
 backend:
   type: sglang
 
   prefill_environment:
-    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
     SGLANG_ENABLE_THINKING: "1"
@@ -70,8 +89,7 @@ backend:
     SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
 
   decode_environment:
-    # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests
+    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
     PYTHONUNBUFFERED: "1"
     SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
     SGLANG_ENABLE_THINKING: "1"
@@ -105,14 +123,11 @@ backend:
   sglang_config:
     prefill:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30 # pr50 sets it, let's do it
-      # tokenizer-worker-num: 16  # need this if we run tokenizer
+      stream-interval: 30
 
-      # Parallel
       tensor-parallel-size: 4
       data-parallel-size: 4
       expert-parallel-size: 4
@@ -128,76 +143,38 @@ backend:
       max-running-requests: 512
       cuda-graph-max-bs: 512
       chunked-prefill-size: 32768
-      # disable-radix-cache: true # NOTE try to enable radix cache
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30 # pr50 sets it, let's do it
-      # tokenizer-worker-num: 16  # need this if we run tokenizer
-      # disable-radix-cache: true # NOTE try to enable radix cache
+      stream-interval: 30
+
+      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
       disaggregation-mode: "decode"
       disaggregation-transfer-backend: mooncake
 
-      # tensor-parallel-size / data-parallel-size / expert-parallel-size
-      # / max-running-requests / cuda-graph-max-bs are set per-override.
-
       mem-fraction-static: 0.94
       swa-full-tokens-ratio: 0.15
       context-length: 16384
-
-  benchmark:
-    type: custom
-    command: |
-      set -e
-      REPO=/configs/upstream-sa-bench/InferenceX
-      [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO"
-      cd "$REPO/utils/bench_serving"
-      python3 benchmark_serving.py \
-        --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
-        --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
-        --dataset-name random \
-        --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \
-        --random-num-workers 96 \
-        --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
-        --num-warmups 512 \
-        --ignore-eos --trust-remote-code \
-        --percentile-metrics ttft,tpot,itl,e2el \
-        --save-result --result-dir /logs --result-filename results.json
-    # concurrencies set per-override
-
-############ 8k1k ##############
-# [0]is wideep, [1] is narrow ep
-zip_override_8k1k_hightpt:
-  resources:
-    prefill_nodes:   [7, 1]
-    prefill_workers: [7, 1]
-    decode_nodes:    [2, 2]
-    decode_workers:  [1, 1]
-  backend:
-    sglang_config:
-      decode:
-        tensor-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
-        data-parallel-size:       [8, 8] # NOTE change from 16gpu to 8gpu
-        expert-parallel-size:     [8, 8] # NOTE change from 16gpu to 8gpu
-
-        enable-dp-attention: true
-        enable-dp-lm-head: true
-
-        # ep-num-redundant-experts + ep-dispatch-algorithm intentionally
-        # removed: no static dispatching file available yet.
-
-        moe-a2a-backend: "deepep"
-        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-        max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu
-        cuda-graph-max-bs:    [1152,  32]
-
-        # benchmark:
-        # isl: 8192
-        # osl: 1024
-        # concurrencies: "16384"
+      max-running-requests: 9216
+      cuda-graph-max-bs: 1152
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false

From 0dbc9a484bdcb0ee27d51b280fe54157b9526889 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 13:26:03 -0700
Subject: [PATCH 24/56] dev

---
 .../{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-dep16.yaml        | 0
 .../{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-tep8.yaml         | 0
 .../{1k1k => 1k1k-stale}/disagg-gb200-3p1d-dep8-dep16.yaml        | 0
 ...1p1d-dep8-tep8.yaml => stale-disagg-gb200-1p1d-dep8-tep8.yaml} | 0
 ...1d-dep8-dep16.yaml => stale-disagg-gb200-3p1d-dep8-dep16.yaml} | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-dep16.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-tep8.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-3p1d-dep8-dep16.yaml (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb200-1p1d-dep8-tep8.yaml => stale-disagg-gb200-1p1d-dep8-tep8.yaml} (100%)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16.yaml => stale-disagg-gb200-3p1d-dep8-dep16.yaml} (100%)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml
similarity index 100%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml

From ba72558eca41d413129b347140cbb17644996320 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 13:44:10 -0700
Subject: [PATCH 25/56] upd

---
 .../8k1k/disagg-gb300-2p1d-dep4-dep8.yaml     | 178 ++++++++++++++++++
 .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml     | 178 ++++++++++++++++++
 2 files changed, 356 insertions(+)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
new file mode 100644
index 000000000..bceffd528
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
@@ -0,0 +1,178 @@
+name: "dsv4-sglang-disagg-gb300-2p1d-dep4-dep8"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape (gpu_type stays
+# gb300, as set under `resources`): container & model.path
+# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
+# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
+# slurm.partition + sbatch_directives + extra_mount + nginx_container
+# dropped (they reference paths/partitions that exist only on the PR
+# author's gb300 cluster).
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# Dynamo pin: PyPI releases (srtctl's default) crash on import with this
+# image's sglang fork (no `sgl.Engine`). Hash bumped from PR #1213 to
+# track the dynamo-sglang dsv4 dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 4 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      max-running-requests: 9216
+      cuda-graph-max-bs: 1152
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
new file mode 100644
index 000000000..731adeb13
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -0,0 +1,178 @@
+name: "dsv4-sglang-disagg-gb300-7p1d-dep4-dep8"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape (gpu_type stays
+# gb300, as set under `resources`): container & model.path
+# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
+# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
+# slurm.partition + sbatch_directives + extra_mount + nginx_container
+# dropped (they reference paths/partitions that exist only on the PR
+# author's gb300 cluster).
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# Dynamo pin: PyPI releases (srtctl's default) crash on import with this
+# image's sglang fork (no `sgl.Engine`). Hash bumped from PR #1213 to
+# track the dynamo-sglang dsv4 dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 7
+  decode_nodes: 2
+  prefill_workers: 7
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 30
+
+      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      max-running-requests: 9216
+      cuda-graph-max-bs: 1152
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "8192"
+  req_rate: "inf"
+  use_chat_template: false

From 7c81fe95d0cff00d439cbc2550dc867614bf9216 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 14:25:25 -0700
Subject: [PATCH 26/56] fix

---
 .github/configs/nvidia-master.yaml            | 105 ++--------
 .../disagg-gb200-1p1d-dep8-dep16.yaml         | 113 -----------
 .../disagg-gb200-1p1d-dep8-tep8.yaml          | 153 ---------------
 .../disagg-gb200-3p1d-dep8-dep16.yaml         | 179 -----------------
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml    | 180 ------------------
 .../stale-disagg-gb200-1p1d-dep8-tep8.yaml    | 113 -----------
 .../stale-disagg-gb200-3p1d-dep8-dep16.yaml   | 112 -----------
 7 files changed, 19 insertions(+), 936 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1c85aeab2..aff5524b3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7717,38 +7717,22 @@ dsv4-fp4-gb200-dynamo-vllm:
         ep: 16
         dp-attn: true
 
-dsv4-fp4-gb200-dynamo-sglang:
+dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:deepseek-v4-grace-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: gb200
+  runner: gb300
   precision: fp4
   framework: dynamo-sglang
   multinode: true
   disagg: true
   seq-len-configs:
-  # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no
-  # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's
-  # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any
-  # DeepEP forward path (both DeepEPLLDispatchOutput and
-  # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel
-  # reads). At TP=8 the shared-experts gate_up_proj would also fail
-  # FP8 block-quant divisibility (1536/8=192, not divisible by 128)
-  # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated
-  # — and that flag is gated on `enable_dp_attention=True` in sglang
-  # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at
-  # its default `"none"` — sglang `forward_normal` path runs (verified
-  # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is
-  # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep
-  # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4-
-  # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with
-  # all-reduce/all-gather MoE dispatch.
-  - isl: 1024
+  - isl: 8192
     osl: 1024
     search-space:
-    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 1p1d-dep8-dep16
-    # recipes) commented out: PR #1213 only refreshed the 3p1d-dep8-dep16
-    # high-throughput recipe; the 1p1d siblings still match the older
+    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16
+    # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16
+    # max-throughput recipe; the 1p1d/3p1d siblings still match the older
     # operational shape and are out of scope for the PR #1213 sweep.
     # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
     # - conc-list: [1, 4, 8, 16, 32, 64]
@@ -7758,89 +7742,38 @@ dsv4-fp4-gb200-dynamo-sglang:
     #     ep: 1
     #     dp-attn: true
     #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    # - conc-list: [128, 256, 1024, 2048, 4096]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
     #   decode:
     #     num-worker: 1
     #     tp: 8
     #     ep: 1
     #     dp-attn: true
-    # High throughput: 3 prefills (TP=4 / DP=4 / EP=4) + 1 decode
-    # (TP=8 / DP=8 / EP=8 wideep). 5 nodes. Refreshed by PR #1213.
-    - conc-list: [4096, 8192]
+    # Mid: 2 prefills (TP=4 / DP=4 / EP=4) + 1 decode (TP=8 / DP=8 / EP=8). 4 nodes.
+    - conc-list: [64]
       prefill:
-        num-worker: 3
-        tp: 8
-        ep: 1
+        num-worker: 2
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16
-    # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16
-    # max-throughput recipe; the 1p1d/3p1d siblings still match the older
-    # operational shape and are out of scope for the PR #1213 sweep.
-    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    # - conc-list: [1, 4, 8, 16, 32, 64]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # - conc-list: [512, 1024]
-    #   prefill:
-    #     num-worker: 3
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
     # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode
     # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213.
-    - conc-list: [4096, 8192]
+    - conc-list: [8192]
       prefill:
         num-worker: 7
-        tp: 8
-        ep: 1
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml"
       decode:
         num-worker: 1
         tp: 8
-        ep: 1
+        ep: 8
         dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml
deleted file mode 100644
index d309562a1..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16"
-
-# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
-# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
-# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
-#
-# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes.
-# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank
-# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing).
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
-dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  decode_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 16
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 512
-      cuda-graph-max-bs: 512
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x1024x2048x4096"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml
deleted file mode 100644
index e20c9c0a2..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml
+++ /dev/null
@@ -1,153 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
-
-# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The
-# closest references on NVIDIA/srt-slurm are:
-#   * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) —
-#     GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars.
-#   * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) —
-#     GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 +
-#     chunked-prefill-size=4096 + disable-flashinfer-autotune.
-# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
-# framework numbers stay directly comparable.
-#
-# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes.
-# Targets very low concurrency (1-64).
-#
-# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"):
-#   1. DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM.
-#      TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits.
-#   2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork
-#      ships a fork-only quant kernel `mxfp4_deepseek.py` that reads
-#      `dispatch_output.topk_output`. Neither `DeepEPLLDispatchOutput`
-#      nor `DeepEPNormalDispatchOutput` exposes that field in this
-#      fork, so `forward_deepep` always crashes the prefill scheduler.
-#      We must stay off the DeepEP path.
-#   3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant
-#      divisibility (1536/8=192, not divisible by block_n=128).
-#      `moe-dense-tp-size: 1` runs the dense MLP layers replicated
-#      (TP=1) so the divisibility check passes — but that flag is
-#      gated on `enable_dp_attention=True` in sglang
-#      `python/sglang/srt/layers/dp_attention.py`
-#      (`compute_dp_attention_local_info` returns the full `tp_size`
-#      and ignores `moe_dense_tp_size` when DP-attn is off).
-# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so
-# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set.
-# The default `"none"` lands the MoE on `forward_normal` instead of
-# `forward_deepep` — verified in deepseek_v2.py:
-#   `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori|
-#    is_ascend_fuseep|is_flashinfer`  → False with default.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI
-# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in
-# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import
-# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell
-# image's sglang fork does not expose `sgl.Engine`, so they crash at
-# import with `AttributeError: module 'sglang' has no attribute
-# 'Engine'`. The DSV4-targeted tag adds `from __future__ import
-# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the
-# annotation lazy so the module imports cleanly.
-dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline
-  # that's actually been run upstream) plus the disaggregation timeout
-  # triple — heartbeat 100k matches the DSR1 sglang disagg convention.
-  prefill_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  decode_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 16
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 3072
-      max-running-requests: 64
-      cuda-graph-max-bs: 64
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml
deleted file mode 100644
index ced4e1e5b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml
+++ /dev/null
@@ -1,179 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
-
-# 1k/1k high-throughput topology for the wideep DSV4-Pro setup.
-#
-# Schema/values come from PR #1213 (513cbef) — that PR introduced the
-# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
-# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
-# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
-# support either: `zip_override_*_hightpt` rejects with `Unknown field`
-# and `benchmark` only validates at top level. So this file inlines the
-# wideep [0] override and lifts `benchmark` back out — same operational
-# values, schema the pinned srtctl will accept.
-#
-# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
-# (matrix runs on gb200-nv runners, not gb300), container & model.path
-# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
-# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
-# slurm.partition + sbatch_directives + extra_mount + nginx_container
-# dropped (they reference paths/partitions that exist only on the PR
-# author's gb300 cluster).
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
-# Hash bumped from PR #1213 to track the dynamo-sglang dsv4 dev branch.
-dynamo:
-  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-# Topology: 3 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
-# (TP=8 / DP=8 / EP=8 / 2 nodes). 5 nodes total.
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 3
-  decode_nodes: 2
-  prefill_workers: 3
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
-    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
-    SGLANG_OPT_USE_FAST_MASK_EP: "1"
-    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
-    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_LOG_FORWARD_ITERS: "1"
-    SGLANG_LOG_MS: "1"
-    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
-
-  decode_environment:
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
-    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
-    SGLANG_OPT_USE_FAST_MASK_EP: "1"
-    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
-    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_LOG_FORWARD_ITERS: "1"
-    SGLANG_LOG_MS: "1"
-    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
-    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
-    # is single-node only and corrupts results in 2-node decode setups.
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      watchdog-timeout: 86400
-      skip-tokenizer-init: true
-      stream-interval: 30
-
-      tensor-parallel-size: 4
-      data-parallel-size: 4
-      expert-parallel-size: 4
-
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-      disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: mooncake
-
-      mem-fraction-static: 0.90
-      max-running-requests: 512
-      cuda-graph-max-bs: 512
-      chunked-prefill-size: 32768
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      watchdog-timeout: 86400
-      skip-tokenizer-init: true
-      stream-interval: 30
-
-      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
-      tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-
-      enable-dp-attention: true
-      enable-dp-lm-head: true
-
-      moe-a2a-backend: "deepep"
-      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-      disaggregation-mode: "decode"
-      disaggregation-transfer-backend: mooncake
-
-      mem-fraction-static: 0.94
-      swa-full-tokens-ratio: 0.15
-      context-length: 16384
-      max-running-requests: 9216
-      cuda-graph-max-bs: 1152
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
deleted file mode 100644
index 3a72d70f8..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
+++ /dev/null
@@ -1,180 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
-
-# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
-#
-# Schema/values come from PR #1213 (513cbef) — that PR introduced the
-# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
-# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
-# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
-# support either: `zip_override_*_hightpt` rejects with `Unknown field`
-# and `benchmark` only validates at top level. So this file inlines the
-# wideep [0] override and lifts `benchmark` back out — same operational
-# values, schema the pinned srtctl will accept.
-#
-# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
-# (matrix runs on gb200-nv runners, not gb300), container & model.path
-# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
-# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
-# slurm.partition + sbatch_directives + extra_mount + nginx_container
-# dropped (they reference paths/partitions that exist only on the PR
-# author's gb300 cluster).
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
-# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
-# dev branch.
-dynamo:
-  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
-# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 7
-  decode_nodes: 2
-  prefill_workers: 7
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
-    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
-    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
-    SGLANG_OPT_USE_FAST_MASK_EP: "1"
-    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
-    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_LOG_FORWARD_ITERS: "1"
-    SGLANG_LOG_MS: "1"
-    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
-
-  decode_environment:
-    SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache"
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
-    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
-    SGLANG_OPT_USE_FAST_MASK_EP: "1"
-    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
-    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_LOG_FORWARD_ITERS: "1"
-    SGLANG_LOG_MS: "1"
-    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
-    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
-    # is single-node only and corrupts results in 2-node decode setups.
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      watchdog-timeout: 86400
-      skip-tokenizer-init: true
-      stream-interval: 30
-
-      tensor-parallel-size: 4
-      data-parallel-size: 4
-      expert-parallel-size: 4
-
-      enable-dp-attention: true
-      moe-a2a-backend: "deepep"
-      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-      disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: mooncake
-
-      mem-fraction-static: 0.90
-      max-running-requests: 512
-      cuda-graph-max-bs: 512
-      chunked-prefill-size: 32768
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      watchdog-timeout: 86400
-      skip-tokenizer-init: true
-      stream-interval: 30
-
-      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
-      tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-
-      enable-dp-attention: true
-      enable-dp-lm-head: true
-
-      moe-a2a-backend: "deepep"
-      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-      disaggregation-mode: "decode"
-      disaggregation-transfer-backend: mooncake
-
-      mem-fraction-static: 0.94
-      swa-full-tokens-ratio: 0.15
-      context-length: 16384
-      max-running-requests: 9216
-      cuda-graph-max-bs: 1152
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4096x8192"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml
deleted file mode 100644
index 218ad01f6..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"
-
-# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and
-# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280
-# (8k+1k+pad), and prefill max-running-requests halves to keep the per-
-# rank prefill working set inside the GPU memory budget.
-#
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream-
-# reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
-dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  decode_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 8
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 64
-      cuda-graph-max-bs: 64
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x4x8x16x32x64"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml
deleted file mode 100644
index a1fd14571..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml
+++ /dev/null
@@ -1,112 +0,0 @@
-name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16"
-
-# 8k/1k mid-throughput topology: 3 prefill (DP=8 EP=8) + 1 wide decode
-# (DP=16 EP=16). 10 nodes. Targets conc 512-1024 — 8k prompts saturate
-# a single prefill worker below conc=512.
-#
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference
-# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale.
-dynamo:
-  hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b
-  install: true
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 6
-  decode_nodes: 2
-  prefill_workers: 3
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  decode_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 8
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "prefill"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      trust-remote-code: true
-      tensor-parallel-size: 8
-      moe-dense-tp-size: 1
-      enable-dp-attention: true
-      dp-size: 8
-      moe-runner-backend: "flashinfer_mxfp4"
-      chunked-prefill-size: 4096
-      disable-flashinfer-autotune: true
-      disable-radix-cache: true
-      mem-fraction-static: 0.82
-      context-length: 9280
-      max-running-requests: 256
-      cuda-graph-max-bs: 256
-      stream-interval: 50
-      decode-log-interval: 1000
-      disaggregation-mode: "decode"
-      disaggregation-bootstrap-port: 30001
-      disaggregation-transfer-backend: nixl
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "512x1024"
-  req_rate: "inf"
-  use_chat_template: false

From 7a1daaf4f5d0c74dde0d0552c422eab0048f222d Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 14:26:40 -0700
Subject: [PATCH 27/56] fix

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index aff5524b3..0166b3a60 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7721,7 +7721,7 @@ dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:deepseek-v4-grace-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: gb300
+  runner: gb300-cw
   precision: fp4
   framework: dynamo-sglang
   multinode: true

From c454ad3e919122b2a8c11aeb9397ec1e469b814a Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 15:17:35 -0700
Subject: [PATCH 28/56] test

---
 perf-changelog.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a8a8bab49..1a4f0b78b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1948,12 +1948,12 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202
 
 - config-keys:
-    - dsv4-fp4-gb200-dynamo-sglang
+    - dsv4-fp4-gb300-dynamo-sglang
   description:
-    - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)"
+    - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)"
     - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
-    - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B"
-    - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
+    - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B"
+    - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157
 
 - config-keys:

From bac301d9ff58255821471dac3a00c20359a059ea Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 15:21:54 -0700
Subject: [PATCH 29/56] add gb300

---
 runners/launch_gb300-cw.sh | 278 +++++++++++++++++++++++++++++++++++++
 1 file changed, 278 insertions(+)
 create mode 100644 runners/launch_gb300-cw.sh

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
new file mode 100644
index 000000000..1b2d27939
--- /dev/null
+++ b/runners/launch_gb300-cw.sh
@@ -0,0 +1,278 @@
+#!/usr/bin/bash
+
+# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw
+# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in
+# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang
+# recipes are copied exactly from the pinned srt-slurm commit below.
+
+set -x
+
+if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    # Weights staged on the shared VAST mount; no compute-node-local
+    # NVMe on cw. The exact upstream recipes refer to this model as
+    # `dspro`.
+    export MODEL_PATH="/mnt/vast/models/dsv4/"
+else
+    echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang"
+    exit 1
+fi
+
+# CoreWeave cluster has a single `all` partition; account `cw-sup` is
+# what `sacctmgr show assoc user=$USER` returns there. `benchmark`
+# (inherited from gb200-nv) does not exist on cw.
+export SLURM_PARTITION="all"
+export SLURM_ACCOUNT="cw-sup"
+
+# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env
+# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so)
+# to mount into the container. cw doesn't set them by default — without
+# them the container has no libcuda and CUDA init fails. SLURM's default
+# --export=ALL propagates these from this shell through sbatch+srun
+# into the enroot environment.
+export NVIDIA_VISIBLE_DEVICES=all
+export NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+NGINX_IMAGE="nginx:1.27.4"
+SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16"
+
+# Squash files live alongside models on /mnt/vast (shared across nodes).
+# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl /
+# pyxis rejects '+' in image paths with "Invalid image format", and the
+# old /mnt/vast/squash dir contains '+'-separated files from prior runs.
+SQUASH_DIR="/mnt/vast/squash_dupe"
+mkdir -p "$SQUASH_DIR"
+SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+
+enroot import -o "$SQUASH_FILE" "docker://$IMAGE"  # quoted so each path/ref is passed as exactly one argument
+enroot import -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE"
+
+export EVAL_ONLY="${EVAL_ONLY:-false}"
+
+export ISL="$ISL"
+export OSL="$OSL"
+
+# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML.
+# Without it, srtctl apply scans every YAML in the repo and submits
+# hundreds of jobs.
+if [[ -z "$CONFIG_FILE" ]]; then
+    echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2
+    echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2
+    exit 1
+fi
+
+echo "Cloning srt-slurm repository..."
+SRT_REPO_DIR="srt-slurm"
+if [ -d "$SRT_REPO_DIR" ]; then
+    echo "Removing existing $SRT_REPO_DIR..."
+    rm -rf "$SRT_REPO_DIR"
+fi
+
+git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || { echo "Error: Failed to clone srt-slurm" >&2; exit 1; }
+cd "$SRT_REPO_DIR" || { echo "Error: Cannot enter $SRT_REPO_DIR" >&2; exit 1; }  # later steps use paths relative to this dir
+git checkout "$SRT_SLURM_RECIPES_COMMIT" || { echo "Error: Failed to check out $SRT_SLURM_RECIPES_COMMIT" >&2; exit 1; }
+
+# Overlay the local copy of the exact pinned recipes. This keeps the PR
+# self-contained while preserving byte-for-byte recipe content from
+# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT.
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4
+
+echo "Installing srtctl..."
+# CRITICAL — uv install location.
+# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
+# shared NFS across both. srtctl's slurm template (job_script_minimal.j2)
+# does `if ! command -v uv` and skips its own ARM64 install when uv is
+# already on PATH; on compute nodes $HOME/.local/bin is on PATH by
+# default, so a stray x86 binary at $HOME/.local/bin/uv from this
+# runner shadows the template's install and crashes the orchestrator
+# with `cannot execute binary file: Exec format error`. Install to a
+# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86
+# uv left in the shared path by prior runs.
+rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx"
+export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin"
+mkdir -p "$XDG_BIN_HOME"
+curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh
+export PATH="$XDG_BIN_HOME:$PATH"
+
+if [ ! -x "$XDG_BIN_HOME/uv" ]; then
+    echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2
+    exit 1
+fi
+if [ -e "$HOME/.local/bin/uv" ]; then
+    echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2
+    exit 1
+fi
+
+uv venv
+source .venv/bin/activate
+uv pip install -e .
+
+if ! command -v srtctl &> /dev/null; then
+    echo "Error: Failed to install srtctl"
+    exit 1
+fi
+
+echo "Configs available at: $SRT_REPO_DIR/"
+
+SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+echo "Creating srtslurm.yaml configuration..."
+cat > srtslurm.yaml <<EOF
+# SRT SLURM Configuration for GB300-CW (SGLang)
+
+default_account: "${SLURM_ACCOUNT}"
+default_partition: "${SLURM_PARTITION}"
+default_time_limit: "8:00:00"
+
+gpus_per_node: 4
+network_interface: ""
+
+srtctl_root: "${SRTCTL_ROOT}"
+
+model_paths:
+  dspro: "${MODEL_PATH}"
+  dsv4-pro: "${MODEL_PATH}"
+containers:
+  dynamo-trtllm: ${SQUASH_FILE}
+  dynamo-sglang: ${SQUASH_FILE}
+  dspro-0426: ${SQUASH_FILE}
+  dspro-0426-nixl: ${SQUASH_FILE}
+  dsv4-grace-blackwell: ${SQUASH_FILE}
+  "${IMAGE}": ${SQUASH_FILE}
+  nginx: ${NGINX_SQUASH_FILE}
+  nginx-sqsh: ${NGINX_SQUASH_FILE}
+# Use one contiguous CW segment for the full allocation. This is a
+# cluster-level setting, not a recipe overlay; the copied recipe files
+# stay byte-identical to the pinned upstream commit.
+use_segment_sbatch_directive: true
+EOF
+
+echo "Generated srtslurm.yaml:"
+cat srtslurm.yaml
+
+echo "Running make setup..."
+make setup ARCH=aarch64
+
+# Export eval-related env vars for srt-slurm post-benchmark eval
+export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
+
+echo "Submitting job with srtctl..."
+
+# Use the runner name for the submitted job. Some exact upstream recipes do
+# not define `name`, so insert it into only the cloned runtime copy.
+if grep -q '^name:' "$CONFIG_FILE"; then
+    sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
+else
+    TMP_CONFIG_FILE="$(mktemp)"
+    awk -v runner_name="${RUNNER_NAME}" 'BEGIN { print "name: \"" runner_name "\"" } { print }' "$CONFIG_FILE" > "$TMP_CONFIG_FILE"
+    mv "$TMP_CONFIG_FILE" "$CONFIG_FILE"
+fi
+
+SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+echo "$SRTCTL_OUTPUT"
+
+JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')
+
+set +x
+
+if [ -z "$JOB_ID" ]; then
+    echo "Error: Failed to extract JOB_ID from srtctl output"
+    exit 1
+fi
+
+echo "Extracted JOB_ID: $JOB_ID"
+
+LOGS_DIR="outputs/$JOB_ID/logs"
+LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log"
+
+while ! ls "$LOG_FILE" &>/dev/null; do
+    if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then
+        echo "ERROR: Job $JOB_ID failed before creating log file"
+        scontrol show job "$JOB_ID"
+        exit 1
+    fi
+    echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..."
+    sleep 5
+done
+
+(
+    while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do
+        sleep 10
+    done
+) &
+POLL_PID=$!
+
+echo "Tailing LOG_FILE: $LOG_FILE"
+
+tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null
+
+wait $POLL_PID
+
+set -x
+
+echo "Job $JOB_ID completed!"
+echo "Collecting results..."
+
+if [ -d "$LOGS_DIR" ]; then
+    echo "Found logs directory: $LOGS_DIR"
+    cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS"
+    tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" .
+else
+    echo "Warning: Logs directory not found at $LOGS_DIR"
+fi
+
+if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
+    if [ ! -d "$LOGS_DIR" ]; then
+        exit 1
+    fi
+
+    RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null)
+
+    if [ -z "$RESULT_SUBDIRS" ]; then
+        echo "Warning: No result subdirectories found in $LOGS_DIR"
+    else
+        for result_subdir in $RESULT_SUBDIRS; do
+            echo "Processing result subdirectory: $result_subdir"
+
+            CONFIG_NAME=$(basename "$result_subdir")
+
+            RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null)
+
+            for result_file in $RESULT_FILES; do
+                if [ -f "$result_file" ]; then
+                    filename=$(basename "$result_file")
+                    concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p')
+                    gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p')
+                    ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p')
+                    gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
+
+                    echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file"
+
+                    WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json"
+                    cp "$result_file" "$WORKSPACE_RESULT_FILE"
+
+                    echo "Copied result file to: $WORKSPACE_RESULT_FILE"
+                fi
+            done
+        done
+    fi
+
+    echo "All result files processed"
+else
+    echo "EVAL_ONLY=true: Skipping benchmark result collection"
+fi
+
+if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then  # either flag triggers eval artifact collection
+    EVAL_DIR="$LOGS_DIR/eval_results"
+    if [ -d "$EVAL_DIR" ]; then
+        echo "Extracting eval results from $EVAL_DIR"
+        shopt -s nullglob  # an empty directory yields zero iterations instead of a literal '*'
+        for eval_file in "$EVAL_DIR"/*; do
+            [ -f "$eval_file" ] || continue  # copy regular files only
+            cp "$eval_file" "$GITHUB_WORKSPACE/"
+            echo "Copied eval artifact: $(basename "$eval_file")"
+        done
+        shopt -u nullglob
+    else
+        echo "WARNING: eval requested (RUN_EVAL=${RUN_EVAL:-false}, EVAL_ONLY=${EVAL_ONLY:-false}) but no eval results found at $EVAL_DIR"
+    fi
+fi

From 1167f6471b9a9118594f9b7e78dea71a2e19e299 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 15:49:07 -0700
Subject: [PATCH 30/56] upd

---
 .github/configs/runners.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 60f3299cf..f574c629c 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -139,3 +139,8 @@ gb300:
 - 'gb300-nv_0'
 - 'gb300-nv_1'
 - 'gb300-nv_2'
+gb300-cw:
+- 'gb300-cw_0'
+- 'gb300-cw_1'
+- 'gb300-cw_2'
+- 'gb300-cw_3'

From cfae9ae0205411250003ac11e2663e7e4227734e Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 15:54:08 -0700
Subject: [PATCH 31/56] fix

---
 runners/launch_gb300-cw.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 1b2d27939..a9bb8996f 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -19,7 +19,7 @@ fi
 
 # CoreWeave cluster has a single `all` partition; account `cw-sup` is
 # what `sacctmgr show assoc user=$USER` returns there. `benchmark`
-# (inherited from gb200-nv) does not exist on cw.
+# (inherited from gb300-nv) does not exist on cw.
 export SLURM_PARTITION="all"
 export SLURM_ACCOUNT="cw-sup"
 
@@ -75,7 +75,7 @@ git checkout "$SRT_SLURM_RECIPES_COMMIT"
 # Overlay the local copy of the exact pinned recipes. This keeps the PR
 # self-contained while preserving byte-for-byte recipe content from
 # NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT.
-cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb300-fp4" recipes/dsv4-pro/sglang/gb300-fp4
 
 echo "Installing srtctl..."
 # CRITICAL — uv install location.

From 0443a1f2d4890093716a95309b85620df6041a44 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 16:01:06 -0700
Subject: [PATCH 32/56] fix

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 509a401a5..bc50f4670 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7777,7 +7777,7 @@ dsv4-fp4-gb300-dynamo-sglang:
         ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep4-dep8.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml"
       decode:
         num-worker: 1
         tp: 8

From 387726da7108242896364196f09fb7a688fbca49 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Tue, 28 Apr 2026 17:07:41 -0700
Subject: [PATCH 33/56] fix

---
 runners/launch_gb300-cw.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index a9bb8996f..62869cb47 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -72,10 +72,13 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout "$SRT_SLURM_RECIPES_COMMIT"
 
-# Overlay the local copy of the exact pinned recipes. This keeps the PR
-# self-contained while preserving byte-for-byte recipe content from
-# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT.
-cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb300-fp4" recipes/dsv4-pro/sglang/gb300-fp4
+# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm
+# checkout. Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch:
+# destination must be `recipes/sglang/deepseek-v4` because
+# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...`
+# in `.github/configs/nvidia-master.yaml` is what srtctl loads.
+mkdir -p recipes/sglang/deepseek-v4
+cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
 
 echo "Installing srtctl..."
 # CRITICAL — uv install location.

From fe6815c2a404fac7094166a0797f8fdd6f2a1a47 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:14:13 +0800
Subject: [PATCH 34/56] fix(launch_gb300-cw): register deepseek-v4-pro alias in
 model_paths

After fixing the recipe overlay path in 1b07108, srtctl now loads our
hand-rolled SGLang recipe and runs preflight, which rejects:

    Error: Preflight failed for recipes/sglang/.../disagg-gb300-2p1d-dep4-dep8.yaml:
    - model.path: Model 'deepseek-v4-pro' is not a local model path and
      is not defined in srtslurm.yaml model_paths.

Both `disagg-gb300-2p1d-dep4-dep8.yaml` and `disagg-gb300-7p1d-dep4-dep8.yaml`
declare `model.path: deepseek-v4-pro` (per the recipe header comment, the
alias is intentionally aligned with `launch_gb200-nv.sh`'s srtslurm.yaml,
which exports `SRT_SLURM_MODEL_PREFIX=deepseek-v4-pro`). The gb300-cw
launcher only registered `dspro` and `dsv4-pro`, so the alias never
resolved. Add `deepseek-v4-pro` mapping to the same `${MODEL_PATH}`.
---
 runners/launch_gb300-cw.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 62869cb47..a6ec57f3c 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -134,6 +134,12 @@ srtctl_root: "${SRTCTL_ROOT}"
 model_paths:
   dspro: "${MODEL_PATH}"
   dsv4-pro: "${MODEL_PATH}"
+  # Our hand-rolled DSV4 sglang recipes use `model.path: deepseek-v4-pro`
+  # (matches the alias in launch_gb200-nv.sh's srtslurm.yaml). Without
+  # this entry srtctl preflight rejects with "Model 'deepseek-v4-pro'
+  # is not a local model path and is not defined in srtslurm.yaml
+  # model_paths".
+  deepseek-v4-pro: "${MODEL_PATH}"
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}

From b4d6c1966e21f255df8889ad001c513ac4048fc4 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:27:44 +0800
Subject: [PATCH 35/56] fix(launch_gb300-cw): pull arm64 squash and force fresh
 import per runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After fixing model.path alias (fe6815c), the slurm orchestrator reached
the head infrastructure srun and crashed at:

    [ERROR] Invalid image format: /mnt/vast/squash_dupe/lmsysorg_sglang_deepseek-v4-grace-blackwell.sqsh
    error: pyxis: failed to create container filesystem
    error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1

Two issues, plus one precaution (item 3):

1. The runner pod that runs `enroot import docker://lmsysorg/sglang:...`
   is x86, so without `--arch` enroot fetches the amd64 manifest. The
   compute nodes (slurm-gb300-138-*) are aarch64 and pyxis there
   rejects the amd64 squash with "Invalid image format". Pass
   `--arch arm64` and tag the cache filename with `_arm64`.

2. `enroot import -o existing.sqsh ...` aborts with
   `[ERROR] File already exists` and leaves the stale file in place,
   so once a half-baked or pre-tag-update squash lands at this path it
   is silently reused on every subsequent CI run. Inspecting
   /mnt/vast/squash_dupe showed an Apr 26 amd64 sqsh shadowing the
   Apr 28 working arm64 sqsh exactly like this. `rm -f` before each
   import forces fresh downloads and picks up Docker tag updates.

3. Scope the squash filename per RUNNER_NAME (gb300-cw_0..3) so that
   the four matrix runners do not race on rm+import of the same shared
   path on /mnt/vast. Cost: ~64 GB on /mnt/vast (4 runners × 16 GB
   per arm64 sqsh) instead of 16 GB shared, which is fine on the
   shared VAST mount.
---
 runners/launch_gb300-cw.sh | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index a6ec57f3c..d9c6dbd17 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -41,11 +41,26 @@ SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16"
 # old /mnt/vast/squash dir contains '+'-separated files from prior runs.
 SQUASH_DIR="/mnt/vast/squash_dupe"
 mkdir -p "$SQUASH_DIR"
-SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
-
-enroot import -o $SQUASH_FILE docker://$IMAGE
-enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the
+# CI runner pod that performs `enroot import` is x86. Without --arch,
+# enroot pulls the host (amd64) manifest and produces a sqsh that pyxis
+# on the compute node rejects with "Invalid image format". Force enroot
+# to pull the arm64 manifest so the cached sqsh is portable to compute
+# nodes. The `_${RUNNER_NAME}_arm64` suffix scopes the cache per runner
+# (gb300-cw_0..3) so concurrent matrix jobs don't rm+import the same
+# file and corrupt each other's downloads.
+SQUASH_TAG="${RUNNER_NAME:-default}_arm64"
+SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh"
+NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh"
+
+# Always rebuild the squash from scratch — `enroot import` aborts with
+# `[ERROR] File already exists` when targeting an existing path, so
+# leaving a stale (interrupted import / pre-update tag) sqsh in place
+# silently keeps using the broken file. rm + import guarantees a fresh
+# import each CI run and picks up Docker tag updates.
+rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE"
+enroot import --arch arm64 -o "$SQUASH_FILE" "docker://$IMAGE"
+enroot import --arch arm64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 

From cad94c937995e0ca9b470ff02520ac27f3a45b87 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:31:21 +0800
Subject: [PATCH 36/56] fix(launch_gb300-cw): use enroot --arch aarch64, not
 arm64

enroot 4.0.1's `common::debarch()` accepts kernel-style arch names
(`x86_64`, `aarch64`, `ppc64le`) and emits Docker-style names
(`amd64`, `arm64`, `ppc64le`) on the wire. Passing `--arch arm64` (the
Docker manifest name) trips the function's else branch immediately:

    [ERROR] Unsupported architecture: arm64

Use the kernel name `aarch64` so enroot can map it to docker's `arm64`
manifest internally.
---
 runners/launch_gb300-cw.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index d9c6dbd17..9b00e21bd 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -59,8 +59,8 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQ
 # silently keeps using the broken file. rm + import guarantees a fresh
 # import each CI run and picks up Docker tag updates.
 rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE"
-enroot import --arch arm64 -o "$SQUASH_FILE" "docker://$IMAGE"
-enroot import --arch arm64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE"
+enroot import --arch aarch64 -o "$SQUASH_FILE" "docker://$IMAGE"
+enroot import --arch aarch64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 

From d6fc0e7eb026433a77143bd6c1d9c4b1b3e15794 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:35:54 +0800
Subject: [PATCH 37/56] fix(launch_gb300-cw): use pre-staged arm64 sqsh, drop
 in-CI enroot import

Even with `--arch aarch64`, `enroot import` from the CI runner pod (x86)
fails when converting the arm64 image:

    [INFO] Converting whiteouts...
    /usr/bin/bash: line 1: /usr/bin/enroot-aufs2ovlfs: Operation not permitted
    (repeated dozens of times, then preflight reports the sqsh as missing)

`enroot-aufs2ovlfs` requires CAP_SYS_ADMIN that the runner pod doesn't
hold, and `lmsysorg/sglang:deepseek-v4-grace-blackwell` is arm64-only,
so the conversion can't be skipped either. Per the documented manual
flow at https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780
the import has to be dispatched to an aarch64 GB300 compute node via
`srun`.

Rather than running an extra slurm job per CI invocation just to
prepare the sqsh, point the launcher at the pre-staged arm64 sqsh that
already lives at
`/mnt/vast/squash_dupe/lmsysorg_sglang_deepseek-v4-grace-blackwell_arm64.sqsh`
(refreshed manually via the gist script when the docker tag is bumped).
The matching `nginx_1.27.4_arm64.sqsh` was symlinked alongside.

Add a fast-fail check so a missing pre-staged sqsh produces a clear
error instead of a confusing pyxis "Invalid image format" three steps
later.
---
 runners/launch_gb300-cw.sh | 45 ++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 9b00e21bd..5ff3fe5e9 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -42,25 +42,32 @@ SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16"
 SQUASH_DIR="/mnt/vast/squash_dupe"
 mkdir -p "$SQUASH_DIR"
 # Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the
-# CI runner pod that performs `enroot import` is x86. Without --arch,
-# enroot pulls the host (amd64) manifest and produces a sqsh that pyxis
-# on the compute node rejects with "Invalid image format". Force enroot
-# to pull the arm64 manifest so the cached sqsh is portable to compute
-# nodes. The `_${RUNNER_NAME}_arm64` suffix scopes the cache per runner
-# (gb300-cw_0..3) so concurrent matrix jobs don't rm+import the same
-# file and corrupt each other's downloads.
-SQUASH_TAG="${RUNNER_NAME:-default}_arm64"
-SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh"
-NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh"
-
-# Always rebuild the squash from scratch — `enroot import` aborts with
-# `[ERROR] File already exists` when targeting an existing path, so
-# leaving a stale (interrupted import / pre-update tag) sqsh in place
-# silently keeps using the broken file. rm + import guarantees a fresh
-# import each CI run and picks up Docker tag updates.
-rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE"
-enroot import --arch aarch64 -o "$SQUASH_FILE" "docker://$IMAGE"
-enroot import --arch aarch64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE"
+# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as
+# arm64-only. The CI runner pod is x86_64 and (a) cannot run
+# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs`
+# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"),
+# and (b) even with `--arch aarch64` the conversion still fails on x86.
+# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780`
+# the import has to be dispatched to an arm64 compute node via srun.
+# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh
+# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist
+# script when the docker tag is updated). Filename suffix `_arm64`
+# distinguishes the working arm64 sqsh from any stale amd64 shadow.
+SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh"
+NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh"
+
+if [[ ! -f "$SQUASH_FILE" ]]; then
+    echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2
+    echo "Refresh it on a GB300 compute node via the script in the gist:" >&2
+    echo "  https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2
+    exit 1
+fi
+if [[ ! -f "$NGINX_SQUASH_FILE" ]]; then
+    echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2
+    echo "Run on an aarch64 host:" >&2
+    echo "  enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2
+    exit 1
+fi
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 

From da6f892b26d2755488c6b673e0c0d9ed6a594e3b Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 08:48:16 +0800
Subject: [PATCH 38/56] fix(launch_gb300-cw): persist dynamo wheel cache and
 ulimit preamble
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-up fixes now that CI successfully reaches slurm. The first
addresses the dynamo-from-source step (`dynamo: hash: 9d3c913d…`) being
rebuilt cold on every CI run, which takes ~10-20 minutes per matrix job;
the second raises the open-file limit for high-concurrency sweeps:

1. Cluster-wide dynamo wheel cache. srtctl's
   `_hash_cached_source_install` (`src/srtctl/core/schema.py:912`) is
   already designed to cache hash-pinned builds at
   `/configs/dynamo-wheels/<hash>/{ai_dynamo_runtime-*.whl,dynamo-src.tar.gz,.complete}`
   under flock. The cache only works if `/configs/dynamo-wheels` survives
   between CI runs, but the launcher does `rm -rf srt-slurm` and
   re-clones every time, blowing it away. Mount
   `/mnt/vast/dynamo-wheels-cache` (NFS, shared by every gb300-cw_N
   runner) over `/configs/dynamo-wheels` via srtslurm.yaml
   `default_mounts`, so the cache survives `rm -rf` and is shared
   across all matrix jobs. After the first cold build the warm path
   should drop dynamo install to ~30 s.

2. Cluster-wide bash preamble for ulimits. yangminl's manual setup on
   this cluster (`/mnt/home/yangminl/srt-slurm/srtslurm.yaml`) sets
   `default_bash_preamble: "ulimit -n 1048576 && ulimit -a"` so the
   dynamo frontend / sglang servers can accept the 8192-concurrency
   sweep without `EMFILE: too many open files`. Mirror that here. The
   feature is supported by srtctl's pinned commit
   (`src/srtctl/core/slurm.py:_get_cluster_bash_preamble`).
---
 runners/launch_gb300-cw.sh | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 5ff3fe5e9..529570e8a 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -140,6 +140,20 @@ fi
 echo "Configs available at: $SRT_REPO_DIR/"
 
 SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+
+# Persistent cluster-wide cache for `dynamo: hash:` source builds. The
+# upstream cache root (_DYNAMO_CACHE_ROOT in srtctl/core/schema.py) is
+# `/configs/dynamo-wheels`; without an override that dir lives inside
+# `srt-slurm/configs`, which the launcher wipes via `rm -rf` every CI
+# run, so each run does a cold ~10-20 min rust+pyo3 build. Stage the
+# cache on /mnt/vast (NFS, shared by all gb300-cw_N runners) and have
+# srtctl bind-mount it over `/configs/dynamo-wheels` via the cluster
+# `default_mounts` setting. flock inside srtctl serializes cold-cache
+# builds across concurrent matrix jobs.
+DYNAMO_WHEELS_CACHE_HOST="/mnt/vast/dynamo-wheels-cache"
+mkdir -p "$DYNAMO_WHEELS_CACHE_HOST"
+mkdir -p configs/dynamo-wheels
+
 echo "Creating srtslurm.yaml configuration..."
 cat > srtslurm.yaml <<EOF
 # SRT SLURM Configuration for GB300-CW (SGLang)
@@ -153,6 +167,9 @@ network_interface: ""
 
 srtctl_root: "${SRTCTL_ROOT}"
 
+default_mounts:
+  ${DYNAMO_WHEELS_CACHE_HOST}: /configs/dynamo-wheels
+
 model_paths:
   dspro: "${MODEL_PATH}"
   dsv4-pro: "${MODEL_PATH}"
@@ -175,6 +192,12 @@ containers:
 # cluster-level setting, not a recipe overlay; the copied recipe files
 # stay byte-identical to the pinned upstream commit.
 use_segment_sbatch_directive: true
+# Cluster-wide bash preamble — runs before every container srun. Raises
+# NOFILE so the dynamo frontend / sglang servers can accept high
+# concurrency (8192 in the 7p1d sweep) without EMFILE / "too many open
+# files". Mirrors what `yangminl@slurm-login-0:~/srt-slurm/srtslurm.yaml`
+# already uses for manual runs on this cluster.
+default_bash_preamble: "ulimit -n 1048576 && ulimit -a"
 EOF
 
 echo "Generated srtslurm.yaml:"

From 28d03e826e72df58b0ca650fd435cfccff2d4aa5 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 09:12:26 +0800
Subject: [PATCH 39/56] fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144
 for dynamo build

slurm assigns 1 CPU/task by default; `scontrol show job <id>` from a
recent CI run shows `NumCPUs=4 NumTasks=4 CPUs/Task=1` with 4 nodes,
i.e. one core per worker. The dynamo `hash:` source install rebuilds
~500 rust crates (kube-client, tonic, hf-hub, image codecs ravif/exr,
pyo3 stack) and at one core takes 30+ min just for the cold build,
which dominates total CI time even with the new
`/configs/dynamo-wheels` cache (the cache only helps after the first
cold run).

Match yangminl's working manual setup
(`/mnt/home/yangminl/srt-slurm/recipes/dsv4-pro/sglang/gb300-fp4/all-dynamo.yaml`)
which sets `sbatch_directives.cpus-per-task: "144"` so cargo gets the
full GB300 host (144 cores) and finishes maturin in a few minutes.
---
 .../deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml       | 6 ++++++
 .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml       | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
index bceffd528..a2ad5d45b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
@@ -34,6 +34,12 @@ dynamo:
 slurm:
   time_limit: "8:00:00"
 
+# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source
+# build (~500 rust crates including ravif/exr/zip) is otherwise serial
+# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144.
+sbatch_directives:
+  cpus-per-task: "144"
+
 health_check:
   max_attempts: 1440
   interval_seconds: 10
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
index 731adeb13..8d0fae386 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -34,6 +34,12 @@ dynamo:
 slurm:
   time_limit: "8:00:00"
 
+# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source
+# build (~500 rust crates including ravif/exr/zip) is otherwise serial
+# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144.
+sbatch_directives:
+  cpus-per-task: "144"
+
 health_check:
   max_attempts: 1440
   interval_seconds: 10

From 16113f810b7ef51df3509a9bec5d97ae8537de12 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 09:14:22 +0800
Subject: [PATCH 40/56] fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144
 and mem=0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

slurm assigns 1 CPU/task by default; `scontrol show job 613` from a
running CI job confirmed `NumCPUs=4 NumTasks=4 CPUs/Task=1` with 4
nodes — one core per worker. The dynamo `hash:` cold source install
rebuilds ~500 rust crates (kube-client, tonic, hf-hub, image codecs
ravif/exr, the pyo3 stack) and at one core takes 30+ min just for the
cold build, which dominates total CI time even with the new
`/configs/dynamo-wheels` cache (the cache only helps after the first
cold run).

Match yangminl's working manual setup on the same gb300-cw cluster
(`/mnt/home/yangminl/srt-slurm/recipes/dsv4-pro/sglang/gb300-fp4/all-dynamo.yaml`)
which sets:
  sbatch_directives:
    cpus-per-task: "144"
    mem: "0"

cargo then gets the full 144-core GB300 host and finishes maturin in a
few minutes; mem=0 hands the worker the entire node's RAM so the
dynamo build + DSV4-Pro 671B FP4 weight load fit without OOM.
---
 .../8k1k/disagg-gb300-2p1d-dep4-dep8.yaml            | 12 +++++++++---
 .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml            | 12 +++++++++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
index a2ad5d45b..bd5a95715 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
@@ -34,11 +34,17 @@ dynamo:
 slurm:
   time_limit: "8:00:00"
 
-# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source
-# build (~500 rust crates including ravif/exr/zip) is otherwise serial
-# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144.
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
 sbatch_directives:
   cpus-per-task: "144"
+  mem: "0"
 
 health_check:
   max_attempts: 1440
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
index 8d0fae386..9d59cbdc3 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -34,11 +34,17 @@ dynamo:
 slurm:
   time_limit: "8:00:00"
 
-# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source
-# build (~500 rust crates including ravif/exr/zip) is otherwise serial
-# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144.
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
 sbatch_directives:
   cpus-per-task: "144"
+  mem: "0"
 
 health_check:
   max_attempts: 1440

From ade5488d24f463ed169e4f8456924257879ec731 Mon Sep 17 00:00:00 2001
From: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
Date: Wed, 29 Apr 2026 10:25:05 +0800
Subject: [PATCH 41/56] fix(launch_gb300-cw): pin srt-slurm fork with parallel
 sa-bench

The current sa-bench in NVIDIA/srt-slurm@9d75f82 generates random
prompts single-threaded, which dominates 7p1d/conc=8192 bench startup
(~50 min just for the 81920-prompt main pass before the first HTTP
request reaches dynamo). Pin to fzyzcjy/srt-slurm fork branch
`feat/random-num-workers` (commit 8094cfb), which is 9d75f82 + the
SemiAnalysisAI/InferenceX `utils/bench_serving/` benchmark_serving.py
ported into sa-bench. With `--random-num-workers 48` (now the default
in bench.sh) prompt generation drops to ~1 min on a 144-core GB300
host, putting the bench-startup cost on the same order as
infra+model-load instead of dominating it.

The fork is paired with the upstream PR
https://github.com/NVIDIA/srt-slurm/pull/114; once that merges, this
pin should revert to the bumped NVIDIA/srt-slurm SHA.
---
 runners/launch_gb300-cw.sh | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 529570e8a..053cfaecf 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -33,7 +33,18 @@ export NVIDIA_VISIBLE_DEVICES=all
 export NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 NGINX_IMAGE="nginx:1.27.4"
-SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16"
+# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers`
+# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt
+# generation). The single-threaded random prompt generator in the
+# upstream sa-bench dominates bench startup on the 7p1d/conc=8192
+# sweep (~50 min for the main pass alone before the first HTTP
+# request leaves the client). The fork bumps that to ~1 min via
+# multiprocessing.Pool with `--random-num-workers 48`.
+#
+# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR
+# (https://github.com/NVIDIA/srt-slurm/pull/114) merges.
+SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git"
+SRT_SLURM_RECIPES_COMMIT="8094cfb1db7cad76fbf9ecb41c0c7e662dad301e"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
 # `squash_dupe` instead of `squash` to use '_'-separated names: srtctl /
@@ -90,7 +101,7 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
 git checkout "$SRT_SLURM_RECIPES_COMMIT"
 

From 152a059d5d170799fd0b64c52c3a2f4ab99f1358 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 10:57:26 +0800
Subject: [PATCH 42/56] fix(launch_gb300-cw): bump srt-slurm fork pin to
 minimal multiproc patch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous pin (8094cfb) was a wholesale replacement of sa-bench with
the SemiAnalysisAI/InferenceX bench_serving — that dropped
`async_request_dynamo_completions` from `ASYNC_REQUEST_FUNCS`, so
`bench.sh` would have died on `--backend dynamo` argparse rejection
the moment the bench client started.

New pin (4249d16) is a tight ~100-line patch on top of
NVIDIA/srt-slurm@9d75f82 that only adds parallel random prompt
generation (`--random-num-workers`); everything else, including the
dynamo backend and `--custom-tokenizer` plumbing, stays exactly the
same as upstream. See https://github.com/NVIDIA/srt-slurm/pull/114.
---
 runners/launch_gb300-cw.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 053cfaecf..ef7260bcb 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -44,7 +44,7 @@ NGINX_IMAGE="nginx:1.27.4"
 # TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR
 # (https://github.com/NVIDIA/srt-slurm/pull/114) merges.
 SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git"
-SRT_SLURM_RECIPES_COMMIT="8094cfb1db7cad76fbf9ecb41c0c7e662dad301e"
+SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
 # `squash_dupe` instead of `squash` to use '_'-separated names: srtctl /

From c435a65db23bb3b247734babbdcbd1ba8438ccb8 Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 10:58:47 +0800
Subject: [PATCH 43/56] ci: temporarily comment out conc-list:[64] 2p1d entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Focus CI on the conc=8192 7p1d max-throughput entry only — re-enable
the 2p1d/conc=64 mid-curve entry shortly once that's green.
---
 .github/configs/nvidia-master.yaml | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bb4e5e1f4..655538929 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7723,19 +7723,21 @@ dsv4-fp4-gb300-dynamo-sglang:
     #     ep: 1
     #     dp-attn: true
     # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    - conc-list: [64]
-      prefill:
-        num-worker: 2
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
+    # TEMPORARILY COMMENTED OUT — focusing CI on the conc=8192 7p1d
+    # max-throughput entry only. Re-enable shortly once that's green.
+    # - conc-list: [64]
+    #   prefill:
+    #     num-worker: 2
+    #     tp: 4
+    #     ep: 4
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
     # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode
     # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213.
     - conc-list: [8192]

From be12dbaceef84726e94751d4e70e25d8d45d5b8e Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 12:37:03 +0800
Subject: [PATCH 44/56] ci(eval): temporarily skip dsv4-fp4-gb300 dynamo-sglang
 eval-only entry

The srt-slurm pin (9d75f82, recipes/dsv4-agg-disagg) lacks the lm-eval
orchestrator path that lives on sa-submission-q2-2026. Skip the auto-generated
eval-only matrix entry for this config until the pin is bumped.

TODO: remove this branch once the pin is moved to sa-submission-q2-2026 (which
already carries the EVAL_ONLY do_sweep.py branch and lm-eval/bench.sh).
---
 utils/matrix_logic/generate_sweep_configs.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index e543bb4af..e9a2195ed 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -114,7 +114,11 @@ def _max_eval_conc(ie):
         )
         mn_groups[key].append((i, entry))
 
-    for entries in mn_groups.values():
+    for key, entries in mn_groups.items():
+        # TODO(pr1157): srt-slurm pin (9d75f82) lacks the lm-eval orchestrator path
+        # (only on sa-submission-q2-2026). Skip eval-only here until the pin is bumped.
+        if key[:3] == ("deepseek-ai/DeepSeek-V4-Pro", "gb300-cw", "dynamo-sglang"):
+            continue
         best_idx, best_entry = max(entries, key=_max_eval_conc)
         eval_indices.add(best_idx)
         # Set eval-conc to median of eligible conc values to avoid OOM during eval

From 38acd774c55d3ac245f4da91dac1e92d08daceed Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 12:17:57 +0800
Subject: [PATCH 45/56] bench(7p1d-dep4-dep8): swap sa-bench default for
 yangminl's gb300-cw recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the sa-bench builder (concurrencies=8192, req_rate=inf, sa-bench
default num_prompts/num_warmups multipliers) with the exact custom
command from yangminl's gb300-cw 8k1k_hightpt[0] run (slurm job 564 on
the dsv4-pro-gb300-fp4 cluster):

  concurrency=4096, rate=48, num_prompts=40960, num_warmups=512,
  random_num_workers=96.

Why mirror those exact knobs: that recipe is what produced the 7p1d
reference numbers we benchmarked against (358K total tok/s, 39.9K output
tok/s, ~5s mean TTFT). Running sa-bench at concurrency=8192/rate=inf
will saturate the single decode worker's GPUs (on job 617 we observed
that concurrency=16384 saturated decode at ~390 running requests per
rank with mean TTFT ~257s, i.e. the equilibrium was gated by decode
compute, not by the bench client), making the result not directly comparable.

Bench framework note: the fzyzcjy fork's benchmark_serving.py /
benchmark_utils.py / encoding_dsv4.py are byte-identical to upstream
SemiAnalysisAI/InferenceX/main; only backend_request_func.py adds five
per-request debug print sites (ok=/lat=/url=/plen=/err=). Throughput
numbers should match sa-bench at the same flags; the fork is chosen
here to keep parity with the reference run's logs.

Skipped on purpose:
- DeepGEMM env knobs (SGLANG_DG_CACHE_DIR / SGLANG_JIT_DEEPGEMM_PRECOMPILE
  vs SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1) — yangminl's cache dir is
  /configs/deepgemm_cache on the gb300-cw host and isn't portable here;
  PR's FAST_WARMUP path stays.
- expert_location_dispatch.py topk_ids int32 cast (yangminl commits
  94b7dc4c7 + e933ef2b1 on the patched sglang fork) — not pulling that
  into the container build.
---
 .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml     | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
index 9d59cbdc3..1d08a229b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -182,9 +182,25 @@ backend:
       cuda-graph-max-bs: 1152
 
 benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "8192"
-  req_rate: "inf"
-  use_chat_template: false
+  type: "custom"
+  # Mirror yangminl's gb300-cw 8k1k_hightpt[0] bench (job 564):
+  #   concurrency=4096, rate=48, num-prompts=40960, num-warmups=512,
+  #   random-num-workers=96. Uses upstream SemiAnalysisAI/InferenceX
+  #   benchmark_serving.py at the same flags so this matches the
+  #   reference run's request shape.
+  command: |
+    set -e
+    REPO=/configs/upstream-sa-bench/InferenceX
+    [ -d "$REPO" ] || git clone https://github.com/SemiAnalysisAI/InferenceX.git "$REPO"
+    cd "$REPO/utils/bench_serving"
+    python3 benchmark_serving.py \
+      --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
+      --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
+      --dataset-name random \
+      --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \
+      --random-num-workers 96 \
+      --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
+      --num-warmups 512 \
+      --ignore-eos --trust-remote-code \
+      --percentile-metrics ttft,tpot,itl,e2el \
+      --save-result --result-dir /logs --result-filename results.json

From 22c5e6739040093ad9a8e0a19dd31415600950ff Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 12:25:05 +0800
Subject: [PATCH 46/56] =?UTF-8?q?config(7p1d-dep4-dep8):=20align=20with=20?=
 =?UTF-8?q?job=20564=20=E2=80=94=20multi-frontend,=20sbatch=20dirs,=20name?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminate every non-cluster-specific diff vs job 564's resolved config
(`/outputs/564/config_8k1k_hightpt_0.yaml`):

- name: match `dsv4-pro-gb300-fp4_8k1k_hightpt_0` (was stale gb200 string)
- frontend.enable_multiple_frontends: false → true; add num_additional_frontends: 8
  (job 564 ran 9 dynamo frontends behind nginx; PR was running a single
  frontend, which was a real router-side runtime diff)
- slurm.time_limit: 8h → 3h to match job 564
- sbatch_directives.cpus-per-task: 144, mem: 0 (portable; already set in this recipe, unchanged by this diff)
- drop health_check block (job 564 doesn't set it; rely on srtctl default)

Remaining diffs vs job 564 are all either cluster-specific path bindings
(slurm.partition=hpc-mid, frontend.nginx_container, extra_mount of
yangminl's patched sglang) or DG-cache env (SGLANG_DG_CACHE_DIR /
SGLANG_JIT_DEEPGEMM_PRECOMPILE) — those need InferenceX-cluster-side
equivalents and are documented in the header comment.
---
 .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml     | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
index 1d08a229b..afa4de33f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -1,4 +1,4 @@
-name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+name: "dsv4-pro-gb300-fp4_8k1k_hightpt_0"
 
 # 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
 #
@@ -11,13 +11,25 @@ name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
 # wideep [0] override and lifts `benchmark` back out — same operational
 # values, schema the pinned srtctl will accept.
 #
-# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
-# (matrix runs on gb200-nv runners, not gb300), container & model.path
-# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
-# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
-# slurm.partition + sbatch_directives + extra_mount + nginx_container
-# dropped (they reference paths/partitions that exist only on the PR
-# author's gb300 cluster).
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
 
 model:
   path: "deepseek-v4-pro"
@@ -32,7 +44,7 @@ dynamo:
   install: true
 
 slurm:
-  time_limit: "8:00:00"
+  time_limit: "03:00:00"
 
 # Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
 #   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
@@ -46,10 +58,6 @@ sbatch_directives:
   cpus-per-task: "144"
   mem: "0"
 
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
 # Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
 # (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
 resources:
@@ -64,7 +72,8 @@ resources:
 
 frontend:
   type: dynamo
-  enable_multiple_frontends: false
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
 
 backend:
   type: sglang

From 15423f1e56c86bc6d42be584da98ecb05de5543b Mon Sep 17 00:00:00 2001
From: fzyzcjy <ch271828n@outlook.com>
Date: Wed, 29 Apr 2026 12:25:46 +0800
Subject: [PATCH 47/56] config(7p1d-dep4-dep8): keep PR name field, revert to
 original

---
 .../sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
index afa4de33f..fc2a1ef7a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
@@ -1,4 +1,4 @@
-name: "dsv4-pro-gb300-fp4_8k1k_hightpt_0"
+name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
 
 # 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
 #

From a1a6f8d0ff4dce526a0cf0c8d0a0ee28f0d92e35 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 12:45:06 -0700
Subject: [PATCH 48/56] config(8k1k): switch gb300 sweep to
 conc1/conc2048/conc16384 recipes

---
 .github/configs/nvidia-master.yaml            |  71 ++++----
 .../sglang/deepseek-v4/8k1k/conc1.yaml        | 167 ++++++++++++++++++
 ...300-2p1d-dep4-dep8.yaml => conc16384.yaml} |  75 ++++----
 ...b300-7p1d-dep4-dep8.yaml => conc2048.yaml} |  59 +++----
 4 files changed, 262 insertions(+), 110 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb300-2p1d-dep4-dep8.yaml => conc16384.yaml} (75%)
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb300-7p1d-dep4-dep8.yaml => conc2048.yaml} (82%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7ad8d5864..9a4e8f39b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7736,52 +7736,45 @@ dsv4-fp4-gb300-dynamo-sglang:
   - isl: 8192
     osl: 1024
     search-space:
-    # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16
-    # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16
-    # max-throughput recipe; the 1p1d/3p1d siblings still match the older
-    # operational shape and are out of scope for the PR #1213 sweep.
-    # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes.
-    # - conc-list: [1, 4, 8, 16, 32, 64]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 1
-    #     dp-attn: true
-    # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes.
-    # TEMPORARILY COMMENTED OUT — focusing CI on the conc=8192 7p1d
-    # max-throughput entry only. Re-enable shortly once that's green.
-    # - conc-list: [64]
-    #   prefill:
-    #     num-worker: 2
-    #     tp: 4
-    #     ep: 4
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 8
-    #     dp-attn: true
-    # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode
-    # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213.
-    - conc-list: [8192]
+    # Low concurrency
+    - conc-list: [1]
       prefill:
-        num-worker: 7
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+    # Mid concurrency
+    - conc-list: [2048]
+      prefill:
+        num-worker: 4
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml"
       decode:
         num-worker: 1
         tp: 8
         ep: 8
         dp-attn: true
+    # Max concurrency
+    - conc-list: [16384]
+      prefill:
+        num-worker: 14
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml
new file mode 100644
index 000000000..1f1649d29
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml
@@ -0,0 +1,167 @@
+name: "conc1"
+
+# 8k/1k low-concurrency (conc=1) topology for the DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 1 prefill (TP=4 / DP=1 / EP=1 / 1 node) + 1 decode
+# (TP=4 / DP=1 / EP=1 / 1 node). 2 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      disable-radix-cache: true
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      tensor-parallel-size: 4
+      data-parallel-size:   1
+      expert-parallel-size: 1
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      disable-radix-cache: true
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      tensor-parallel-size: 4
+      data-parallel-size:   1
+      expert-parallel-size: 1
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+
+      mem-fraction-static: 0.9
+      max-running-requests: 1024
+      cuda-graph-max-bs:    512
+      swa-full-tokens-ratio: 0.1
+      context-length: 16384
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml
similarity index 75%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml
index bd5a95715..4d696ae35 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml
@@ -1,4 +1,4 @@
-name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+name: "conc16384"
 
 # 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
 #
@@ -11,13 +11,25 @@ name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
 # wideep [0] override and lifts `benchmark` back out — same operational
 # values, schema the pinned srtctl will accept.
 #
-# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200
-# (matrix runs on gb200-nv runners, not gb300), container & model.path
-# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml
-# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`),
-# slurm.partition + sbatch_directives + extra_mount + nginx_container
-# dropped (they reference paths/partitions that exist only on the PR
-# author's gb300 cluster).
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
 
 model:
   path: "deepseek-v4-pro"
@@ -32,7 +44,7 @@ dynamo:
   install: true
 
 slurm:
-  time_limit: "8:00:00"
+  time_limit: "03:00:00"
 
 # Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
 #   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
@@ -46,25 +58,22 @@ sbatch_directives:
   cpus-per-task: "144"
   mem: "0"
 
-health_check:
-  max_attempts: 1440
-  interval_seconds: 10
-
-# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
-# (TP=8 / DP=8 / EP=8 / 2 nodes). 3 nodes total.
+# Topology: 14 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=16 / DP=16 / EP=16 / 4 nodes). 18 nodes total.
 resources:
   gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 2
-  decode_workers: 1
+  prefill_nodes: 14
+  prefill_workers: 14
   gpus_per_prefill: 4
-  gpus_per_decode: 8
+  decode_nodes: 4
+  decode_workers: 1
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
-  enable_multiple_frontends: false
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
 
 backend:
   type: sglang
@@ -136,7 +145,7 @@ backend:
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30
+      stream-interval: 60
 
       tensor-parallel-size: 4
       data-parallel-size: 4
@@ -159,15 +168,7 @@ backend:
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30
-
-      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
-      tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      stream-interval: 60
 
       moe-a2a-backend: "deepep"
       deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
@@ -178,13 +179,21 @@ backend:
       mem-fraction-static: 0.94
       swa-full-tokens-ratio: 0.15
       context-length: 16384
-      max-running-requests: 9216
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      max-running-requests: 18432
       cuda-graph-max-bs: 1152
 
+
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "64"
+  concurrencies: "16384"
   req_rate: "inf"
   use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml
similarity index 82%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml
index fc2a1ef7a..72b8babf5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml
@@ -1,4 +1,4 @@
-name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16"
+name: "conc2048"
 
 # 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
 #
@@ -63,11 +63,11 @@ sbatch_directives:
 resources:
   gpu_type: "gb300"
   gpus_per_node: 4
-  prefill_nodes: 7
+  prefill_nodes: 4
+  prefill_workers: 4
+  gpus_per_prefill: 4
   decode_nodes: 2
-  prefill_workers: 7
   decode_workers: 1
-  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -145,7 +145,7 @@ backend:
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30
+      stream-interval: 60
 
       tensor-parallel-size: 4
       data-parallel-size: 4
@@ -168,15 +168,7 @@ backend:
       trust-remote-code: true
       watchdog-timeout: 86400
       skip-tokenizer-init: true
-      stream-interval: 30
-
-      # Wideep decode shape (zip_override [0] from PR #1213, inlined).
-      tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      stream-interval: 60
 
       moe-a2a-backend: "deepep"
       deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
@@ -187,29 +179,20 @@ backend:
       mem-fraction-static: 0.94
       swa-full-tokens-ratio: 0.15
       context-length: 16384
-      max-running-requests: 9216
-      cuda-graph-max-bs: 1152
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
 
 benchmark:
-  type: "custom"
-  # Mirror yangminl's gb300-cw 8k1k_hightpt[0] bench (job 564):
-  #   concurrency=4096, rate=48, num-prompts=40960, num-warmups=512,
-  #   random-num-workers=96. Uses upstream SemiAnalysisAI/InferenceX
-  #   benchmark_serving.py at the same flags so this matches the
-  #   reference run's request shape.
-  command: |
-    set -e
-    REPO=/configs/upstream-sa-bench/InferenceX
-    [ -d "$REPO" ] || git clone https://github.com/SemiAnalysisAI/InferenceX.git "$REPO"
-    cd "$REPO/utils/bench_serving"
-    python3 benchmark_serving.py \
-      --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \
-      --host 127.0.0.1 --port 8000 --endpoint /v1/completions \
-      --dataset-name random \
-      --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \
-      --random-num-workers 96 \
-      --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \
-      --num-warmups 512 \
-      --ignore-eos --trust-remote-code \
-      --percentile-metrics ttft,tpot,itl,e2el \
-      --save-result --result-dir /logs --result-filename results.json
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
+  use_chat_template: false

From b146b86cbc8daae4a68dfada183fb617edae34d0 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 12:59:35 -0700
Subject: [PATCH 49/56] perf-changelog: add minimaxm2.5-fp4-mi355x-atom and
 dsv4-fp4-b300-sglang entries

---
 perf-changelog.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1dd575b18..4c447eaf6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1971,6 +1971,13 @@
     - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204
 
+- config-keys:
+    - minimaxm2.5-fp4-mi355x-atom
+  description:
+    - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)"
+    - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042
+
 - config-keys:
     - dsv4-fp4-gb200-dynamo-vllm
   description:
@@ -1994,3 +2001,10 @@
     - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
     - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16"
+    - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209

From c843c0df725f9c5e7c1682c69f265b462afd6673 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 13:07:16 -0700
Subject: [PATCH 50/56] fix

---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a4515e784..e0546789e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1961,7 +1961,7 @@
     - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe"
     - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204
-
+  
 - config-keys:
     - minimaxm2.5-fp4-mi355x-atom
   description:
@@ -1969,7 +1969,7 @@
     - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042
 
-- config-keys:
+- config-keys:  
     - dsv4-fp4-gb200-dynamo-vllm
   description:
     - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0"

From 927edfebeb94c9487685cd60a6278a8fa8b630fb Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 15:40:19 -0700
Subject: [PATCH 51/56] Add mid-concurrency 8k1k GB300 SGLang recipes (conc256, conc256-dp, conc512, conc1024)

---
 .github/configs/nvidia-master.yaml            |  84 ++++++--
 .../sglang/deepseek-v4/8k1k/conc1024.yaml     | 198 ++++++++++++++++++
 .../sglang/deepseek-v4/8k1k/conc256-dp.yaml   | 198 ++++++++++++++++++
 .../sglang/deepseek-v4/8k1k/conc256.yaml      | 167 +++++++++++++++
 .../sglang/deepseek-v4/8k1k/conc512.yaml      | 198 ++++++++++++++++++
 5 files changed, 831 insertions(+), 14 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0f0653ad3..af88972ea 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7737,45 +7737,101 @@ dsv4-fp4-gb300-dynamo-sglang:
   - isl: 8192
     osl: 1024
     search-space:
-    # Low concurrency
-    - conc-list: [1]
+    # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes.
+    - conc-list: [256]
       prefill:
         num-worker: 1
         tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256.yaml"
       decode:
         num-worker: 1
-        tp: 4
+        tp: 8
         ep: 1
         dp-attn: false
-    # Mid concurrency
-    - conc-list: [2048]
+    # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes.
+    - conc-list: [256]
       prefill:
-        num-worker: 4
+        num-worker: 1
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml"
       decode:
         num-worker: 1
         tp: 8
         ep: 8
         dp-attn: true
-    # Max concurrency
-    - conc-list: [16384]
+    # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes.
+    - conc-list: [512]
       prefill:
-        num-worker: 14
+        num-worker: 1
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512.yaml"
       decode:
         num-worker: 1
-        tp: 16
-        ep: 16
+        tp: 8
+        ep: 8
         dp-attn: true
+    # DP-attn wideep: 2p1d-dep4-dep8. 4 nodes.
+    - conc-list: [1024]
+      prefill:
+        num-worker: 2
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1024.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    # # Low concurrency
+    # - conc-list: [1]
+    #   prefill:
+    #     num-worker: 1
+    #     tp: 4
+    #     ep: 1
+    #     dp-attn: false
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 4
+    #     ep: 1
+    #     dp-attn: false
+    # # Mid concurrency
+    # - conc-list: [2048]
+    #   prefill:
+    #     num-worker: 4
+    #     tp: 4
+    #     ep: 4
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 8
+    #     ep: 8
+    #     dp-attn: true
+    # # Max concurrency
+    # - conc-list: [16384]
+    #   prefill:
+    #     num-worker: 14
+    #     tp: 4
+    #     ep: 4
+    #     dp-attn: true
+    #     additional-settings:
+    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml"
+    #   decode:
+    #     num-worker: 1
+    #     tp: 16
+    #     ep: 16
+    #     dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml
new file mode 100644
index 000000000..d1f6aa2bf
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml
@@ -0,0 +1,198 @@
+name: "conc1024"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 4 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  prefill_workers: 2
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 1
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      # moe-a2a-backend / deepep-config are set once, with the parallelism
+      # flags below; a repeated mapping key is invalid YAML.
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml
new file mode 100644
index 000000000..eac786947
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml
@@ -0,0 +1,198 @@
+name: "conc256-dp"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 1 prefill (TP=4 / DP=4 / EP=4 / 1 node) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 3 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 1
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      # moe-a2a-backend / deepep-config are set once, with the parallelism
+      # flags below; a repeated mapping key is invalid YAML.
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
new file mode 100644
index 000000000..ff628d272
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
@@ -0,0 +1,167 @@
+name: "conc256"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 1
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      disable-radix-cache: true
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      tensor-parallel-size: 4
+      data-parallel-size:   1
+      expert-parallel-size: 1
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      disable-radix-cache: true
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      tensor-parallel-size: 8
+      data-parallel-size:   1
+      expert-parallel-size: 1
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      disable-flashinfer-autotune: true
+
+      mem-fraction-static: 0.9
+      max-running-requests: 1024
+      cuda-graph-max-bs:    512
+      swa-full-tokens-ratio: 0.1
+      context-length: 16384
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "16"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml
new file mode 100644
index 000000000..71cfa4bc3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml
@@ -0,0 +1,198 @@
+name: "conc512"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+#
+# Schema/values come from PR #1213 (513cbef) — that PR introduced the
+# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
+# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
+# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
+# support either: `zip_override_*_hightpt` rejects with `Unknown field`
+# and `benchmark` only validates at top level. So this file inlines the
+# wideep [0] override and lifts `benchmark` back out — same operational
+# values, schema the pinned srtctl will accept.
+#
+# Other adjustments back to the InferenceX cluster shape: container &
+# model.path restored to the aliases mapped in launch_gb300.sh's
+# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
+# `deepseek-v4-pro`); `dynamo.install: true` added so the container
+# (which has no dynamo baked in) installs from the pinned hash.
+#
+# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
+#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
+#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
+#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
+#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
+#     active runtime diff vs container sglang; other patched files are
+#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
+#     already sets.
+#
+# DG-related env intentionally diverged (DG cache path is host-specific):
+#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
+#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
+#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
+  precision: "fp4"
+
+# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
+# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
+# dev branch.
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
+#     turns the dynamo `hash:` cold source build (~500 rust crates,
+#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+#     cargo finishes in ~5 min.
+#   mem=0              — slurm's "give the whole node's memory"; needed
+#     for sglang loading 671B FP4 weights + dynamo build at the same
+#     time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 1 prefill (TP=4 / DP=4 / EP=4 / 1 node) + 1 decode
+# (TP=8 / DP=8 / EP=8 / 2 nodes). 3 nodes total.
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 1
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      # moe-a2a-backend / deepep-config are set once, with the parallelism
+      # flags below; a repeated mapping key is invalid YAML.
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      moe-a2a-backend: deepep
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
+  use_chat_template: false

From c14d06dd75a81a1f02439b41f1885165916abb06 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 15:42:12 -0700
Subject: [PATCH 52/56] Fix conc256 recipe: gpus_per_decode 8, concurrency 256

---
 .../srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
index ff628d272..a4460a3c5 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
@@ -68,7 +68,7 @@ resources:
   gpus_per_prefill: 4
   decode_nodes: 2
   decode_workers: 1
-  gpus_per_decode: 4
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -162,6 +162,6 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "16"
+  concurrencies: "256"
   req_rate: "inf"
   use_chat_template: false

From 5e86ffcf98dd894729acbfa64e2834445e823c4e Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 15:49:27 -0700
Subject: [PATCH 53/56] Adjust blank-line separators in perf-changelog.yaml

---
 perf-changelog.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c0c5b5e4c..f27d33ea6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1961,14 +1961,14 @@
     - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe"
     - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204
-  
+
 - config-keys:
     - minimaxm2.5-fp4-mi355x-atom
   description:
     - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)"
     - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042
-
+
 - config-keys:  
     - dsv4-fp4-gb200-dynamo-vllm
   description:

From 5776fd5a969bed8cc1f08d238437c549859bf0bc Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 17:16:58 -0700
Subject: [PATCH 54/56] Replace 8k1k conc256 recipes with conc512-20 (TP=16 decode on 4 nodes)

---
 .github/configs/nvidia-master.yaml            |  20 +--
 .../sglang/deepseek-v4/8k1k/conc256.yaml      | 167 ------------------
 .../8k1k/{conc256-dp.yaml => conc512-20.yaml} |  14 +-
 3 files changed, 10 insertions(+), 191 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{conc256-dp.yaml => conc512-20.yaml} (97%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b7f0c607b..155f1b7c0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7738,34 +7738,20 @@ dsv4-fp4-gb300-dynamo-sglang:
     osl: 1024
     search-space:
     # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes.
-    - conc-list: [256]
+    - conc-list: [512]
       prefill:
         num-worker: 1
         tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256.yaml"
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml"
       decode:
         num-worker: 1
-        tp: 8
+        tp: 16
         ep: 1
         dp-attn: false
     # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes.
-    - conc-list: [256]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-    # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes.
     - conc-list: [512]
       prefill:
         num-worker: 1
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
deleted file mode 100644
index a4460a3c5..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml
+++ /dev/null
@@ -1,167 +0,0 @@
-name: "conc256"
-
-# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
-#
-# Schema/values come from PR #1213 (513cbef) — that PR introduced the
-# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override`
-# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our
-# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't
-# support either: `zip_override_*_hightpt` rejects with `Unknown field`
-# and `benchmark` only validates at top level. So this file inlines the
-# wideep [0] override and lifts `benchmark` back out — same operational
-# values, schema the pinned srtctl will accept.
-#
-# Other adjustments back to the InferenceX cluster shape: container &
-# model.path restored to the aliases mapped in launch_gb300.sh's
-# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and
-# `deepseek-v4-pro`); `dynamo.install: true` added so the container
-# (which has no dynamo baked in) installs from the pinned hash.
-#
-# Cluster-specific items NOT inlined (require InferenceX-side equivalents):
-#   - slurm.partition (yangminl's gb300-cw uses `hpc-mid`)
-#   - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path)
-#   - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis
-#     showed only `expert_location_dispatch.py` topk_ids int32 cast is an
-#     active runtime diff vs container sglang; other patched files are
-#     env-gated dead code under the same SGLANG_OPT_* flags this yaml
-#     already sets.
-#
-# DG-related env intentionally diverged (DG cache path is host-specific):
-#   - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host)
-#   - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache)
-#   This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead.
-
-model:
-  path: "deepseek-v4-pro"
-  container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
-  precision: "fp4"
-
-# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin
-# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4
-# dev branch.
-dynamo:
-  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
-  install: true
-
-slurm:
-  time_limit: "03:00:00"
-
-# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
-#   cpus-per-task=144  — without this slurm hands out 1 CPU/task, which
-#     turns the dynamo `hash:` cold source build (~500 rust crates,
-#     ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
-#     cargo finishes in ~5 min.
-#   mem=0              — slurm's "give the whole node's memory"; needed
-#     for sglang loading 671B FP4 weights + dynamo build at the same
-#     time without OOM.
-sbatch_directives:
-  cpus-per-task: "144"
-  mem: "0"
-
-# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode
-# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
-resources:
-  gpu_type: "gb300"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  prefill_workers: 1
-  gpus_per_prefill: 4
-  decode_nodes: 2
-  decode_workers: 1
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: true
-  num_additional_frontends: 8
-
-backend:
-  type: sglang
-
-  prefill_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-
-  decode_environment:
-    PYTHONUNBUFFERED: "1"
-    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
-    SGLANG_ENABLE_THINKING: "1"
-    SGLANG_REASONING_EFFORT: "max"
-    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
-    SGLANG_OPT_USE_JIT_NORM: "1"
-    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
-    SGLANG_OPT_USE_TOPK_V2: "1"
-    NCCL_MNNVL_ENABLE: "1"
-    NCCL_CUMEM_ENABLE: "1"
-    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
-    MC_FORCE_MNNVL: "1"
-    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
-    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
-    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
-    # is single-node only and corrupts results in 2-node decode setups.
-
-  sglang_config:
-    prefill:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      disable-radix-cache: true
-
-      disaggregation-mode: "prefill"
-      disaggregation-transfer-backend: mooncake
-
-      tensor-parallel-size: 4
-      data-parallel-size:   1
-      expert-parallel-size: 1
-
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-
-      mem-fraction-static: 0.90
-      max-running-requests: 512
-      cuda-graph-max-bs: 512
-      chunked-prefill-size: 32768
-
-    decode:
-      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
-      model-path: "/model/"
-      trust-remote-code: true
-      disable-radix-cache: true
-
-      disaggregation-mode: "decode"
-      disaggregation-transfer-backend: mooncake
-
-      tensor-parallel-size: 8
-      data-parallel-size:   1
-      expert-parallel-size: 1
-
-      moe-runner-backend: "flashinfer_mxfp4"
-      disable-flashinfer-autotune: true
-
-      mem-fraction-static: 0.9
-      max-running-requests: 1024
-      cuda-graph-max-bs:    512
-      swa-full-tokens-ratio: 0.1
-      context-length: 16384
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "256"
-  req_rate: "inf"
-  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml
similarity index 97%
rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml
index eac786947..526aa8636 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml
@@ -1,4 +1,4 @@
-name: "conc256-dp"
+name: "conc512-20"
 
 # 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
 #
@@ -66,9 +66,9 @@ resources:
   prefill_nodes: 1
   prefill_workers: 1
   gpus_per_prefill: 4
-  decode_nodes: 2
+  decode_nodes: 4
   decode_workers: 1
-  gpus_per_decode: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -179,9 +179,9 @@ backend:
       mem-fraction-static: 0.94
       swa-full-tokens-ratio: 0.15
       context-length: 16384
-      tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
       enable-dp-attention: true
       enable-dp-lm-head: true
       moe-a2a-backend: deepep
@@ -193,6 +193,6 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "256"
+  concurrencies: "512"
   req_rate: "inf"
   use_chat_template: false

From fce13d0546350755ec87743b6b34450ff32bb766 Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 17:19:46 -0700
Subject: [PATCH 55/56] Fix conc512-20 master entry: DP-attn with ep=4 prefill / ep=16 decode

---
 .github/configs/nvidia-master.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f5da3cdbf..c8d6834ca 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7762,20 +7762,20 @@ dsv4-fp4-gb300-dynamo-sglang:
   - isl: 8192
     osl: 1024
     search-space:
-    # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes.
+    # WideEP TP=16 decode: 1p1d-dep4-dep16. 5 nodes (4P + 16D = 20 GPUs).
     - conc-list: [512]
       prefill:
         num-worker: 1
         tp: 4
-        ep: 1
-        dp-attn: false
+        ep: 4
+        dp-attn: true
         additional-settings:
         - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml"
       decode:
         num-worker: 1
         tp: 16
-        ep: 1
-        dp-attn: false
+        ep: 16
+        dp-attn: true
     # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes.
     - conc-list: [512]
       prefill:

From 484763a7fdf5b7472ce57802a7952d7b81cf5ece Mon Sep 17 00:00:00 2001
From: Cheng Wan <chwan@rice.edu>
Date: Wed, 29 Apr 2026 18:55:33 -0700
Subject: [PATCH 56/56] Re-enable 8k1k conc1, conc2048 and conc16384 search-space entries

---
 .github/configs/nvidia-master.yaml | 84 +++++++++++++++---------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c8d6834ca..aff249a8b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7804,45 +7804,45 @@ dsv4-fp4-gb300-dynamo-sglang:
         tp: 8
         ep: 8
         dp-attn: true
-    # # Low concurrency
-    # - conc-list: [1]
-    #   prefill:
-    #     num-worker: 1
-    #     tp: 4
-    #     ep: 1
-    #     dp-attn: false
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 4
-    #     ep: 1
-    #     dp-attn: false
-    # # Mid concurrency
-    # - conc-list: [2048]
-    #   prefill:
-    #     num-worker: 4
-    #     tp: 4
-    #     ep: 4
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 8
-    #     ep: 8
-    #     dp-attn: true
-    # # Max concurrency
-    # - conc-list: [16384]
-    #   prefill:
-    #     num-worker: 14
-    #     tp: 4
-    #     ep: 4
-    #     dp-attn: true
-    #     additional-settings:
-    #     - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml"
-    #   decode:
-    #     num-worker: 1
-    #     tp: 16
-    #     ep: 16
-    #     dp-attn: true
+    # Low concurrency
+    - conc-list: [1]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml"
+      decode:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+    # Mid concurrency
+    - conc-list: [2048]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    # Max concurrency
+    - conc-list: [16384]
+      prefill:
+        num-worker: 14
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml"
+      decode:
+        num-worker: 1
+        tp: 16
+        ep: 16
+        dp-attn: true