From dbb221882dd76eb8e7662a51cab87544ad3796b2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:12:11 -0700 Subject: [PATCH 01/28] Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo Adapts the kimi-k2.5 dynamo-vllm 8k1k 5p1d-dep4-dep8 recipe to DeepSeek V4 Flash. Recipes live under srt-slurm-recipes/ and are copied into the srt-slurm checkout at runtime since the upstream NVIDIA/srt-slurm repo doesn't ship DSV4 recipes. - New config key: dsv4-fp4-gb200-dynamo-vllm - Image: vllm/vllm-openai:deepseekv4-cu130 - Model: deepseek-ai/DeepSeek-V4-Flash - Model path on cluster: /mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash --- .github/configs/nvidia-master.yaml | 27 +++++ perf-changelog.yaml | 8 ++ runners/launch_gb200-nv.sh | 12 ++- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 101 ++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ec9cbc11e..76da9e7d7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7428,3 +7428,30 @@ kimik2.5-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb200-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu130 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [2048] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddc6409c2..1445ad3c7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,11 @@ +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 5p1d)" + - "Container: vllm/vllm-openai:deepseekv4-cu130" + - "Recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index b746e4a24..cb0396421 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -42,8 +42,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then if [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. 
Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 fi else @@ -134,7 +137,12 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -if [[ $FRAMEWORK == "dynamo-vllm" ]]; then +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml new file mode 100644 index 000000000..6f5db6e49 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -0,0 +1,101 @@ +name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" + +model: + path: "deepseek-v4-flash" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" From 1bb849472caed1897592777ae7ff09dd5bd0fdb1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:24:46 -0700 Subject: [PATCH 02/28] flags --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 6f5db6e49..21fe86970 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -55,18 +55,21 @@ backend: enable-expert-parallel: true max-model-len: 10240 max-num-seqs: 64 - enforce-eager: true - compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' max-num-batched-tokens: 16384 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true no-enable-chunked-prefill: true attention-backend: "FLASHINFER_MLA" - block-size: 64 - attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + block-size: 256 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true, "use_fp4_indexer_cache": true}' all2all-backend: "flashinfer_nvlink_one_sided" gpu-memory-utilization: 0.9 + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -86,12 +89,17 @@ backend: no-enable-chunked-prefill: true async-scheduling: true attention-backend: "FLASHINFER_MLA" - block-size: 64 + block-size: 256 + attention-config: '{"use_fp4_indexer_cache": true}' all2all-backend: "flashinfer_nvlink_one_sided" - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' gpu-memory-utilization: 0.9 stream-interval: 50 max-cudagraph-capture-size: 512 + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" benchmark: type: "sa-bench" From 41e71b833c712b4dcc392e3f096c054d77bf86a4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:27:58 -0700 Subject: [PATCH 03/28] import --- .../vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 21fe86970..a98e63480 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -6,7 +6,10 @@ model: precision: "fp4" dynamo: - version: 1.0.1 + # Source install pinned to the first commit that fixes the vllm.inputs 
restructure. + # v1.0.1 / v1.0.2 / v1.1.0-dev.* on PyPI still import from vllm.inputs.data, which + # was removed in the vLLM build inside vllm/vllm-openai:deepseekv4-cu130. + hash: d5803cbe71c0035a725652373a175f01942c4a33 install: true setup_script: vllm-container-deps.sh From 4854a7a0db0f10c2dfbd906dd382ab48d03f4a04 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:42:03 -0700 Subject: [PATCH 04/28] flags --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index a98e63480..18450501a 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -27,6 +27,16 @@ resources: frontend: type: dynamo enable_multiple_frontends: false + # --dyn-chat-processor vllm routes OpenAI pre/post-processing through vLLM's + # FrontendArgs, which is what recognises --tool-call-parser, + # --enable-auto-tool-choice, and --reasoning-parser. In a Dynamo disagg split + # these are server-layer concerns: the workers (dynamo.vllm) only accept + # AsyncEngineArgs. + args: + dyn-chat-processor: "vllm" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" backend: type: vllm @@ -70,9 +80,6 @@ backend: all2all-backend: "flashinfer_nvlink_one_sided" gpu-memory-utilization: 0.9 tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -100,9 +107,6 @@ backend: stream-interval: 50 max-cudagraph-capture-size: 512 tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" benchmark: type: "sa-bench" From ac030e6d315a073db0f369f28633875b6211123c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 22:02:12 -0700 Subject: [PATCH 05/28] recipe change --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 62 ++++++++----------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 18450501a..9d727f400 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -1,14 +1,19 @@ name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" +# Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). +# Changes: +# * DeepSeek-V4-Flash instead of Pro (smaller model, same arch) +# * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 +# * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 +# on PyPI still imports from vllm.inputs.data, which the vLLM in +# vllm/vllm-openai:deepseekv4-cu130 no longer exposes) + model: path: "deepseek-v4-flash" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" dynamo: - # Source install pinned to the first commit that fixes the vllm.inputs restructure. - # v1.0.1 / v1.0.2 / v1.1.0-dev.* on PyPI still import from vllm.inputs.data, which - # was removed in the vLLM build inside vllm/vllm-openai:deepseekv4-cu130. 
hash: d5803cbe71c0035a725652373a175f01942c4a33 install: true @@ -27,34 +32,26 @@ resources: frontend: type: dynamo enable_multiple_frontends: false - # --dyn-chat-processor vllm routes OpenAI pre/post-processing through vLLM's - # FrontendArgs, which is what recognises --tool-call-parser, - # --enable-auto-tool-choice, and --reasoning-parser. In a Dynamo disagg split - # these are server-layer concerns: the workers (dynamo.vllm) only accept - # AsyncEngineArgs. - args: - dyn-chat-processor: "vllm" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" backend: type: vllm connector: null prefill_environment: - VLLM_USE_FLASHINFER_MOE_FP4: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" decode_environment: - VLLM_USE_FLASHINFER_MOE_FP4: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" vllm_config: prefill: @@ -66,20 +63,16 @@ backend: data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 10240 - max-num-seqs: 64 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 max-num-batched-tokens: 16384 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - no-enable-chunked-prefill: true - attention-backend: "FLASHINFER_MLA" block-size: 256 - attention-config: '{"use_trtllm_ragged_deepseek_prefill": true, "use_fp4_indexer_cache": true}' - all2all-backend: "flashinfer_nvlink_one_sided" - gpu-memory-utilization: 0.9 - tokenizer-mode: "deepseek_v4" + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -90,23 +83,18 @@ backend: data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 10240 - max-num-seqs: 512 - max-num-batched-tokens: 10240 - safetensors-load-strategy: "prefetch" + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 trust-remote-code: true no-enable-prefix-caching: true - no-enable-chunked-prefill: true - async-scheduling: true - attention-backend: "FLASHINFER_MLA" block-size: 256 - attention-config: '{"use_fp4_indexer_cache": true}' - all2all-backend: "flashinfer_nvlink_one_sided" - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 - max-cudagraph-capture-size: 512 - tokenizer-mode: "deepseek_v4" + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true benchmark: type: "sa-bench" From b592c60b2969bfed3e4dcc0a4e2674362cdc81b6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 22:26:30 -0700 Subject: [PATCH 06/28] prompt --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 2 ++ .../vllm/deepseek-v4/dsv4-chat-template.jinja | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 9d727f400..fa546761c 100644 
--- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -44,6 +44,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -52,6 +53,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" vllm_config: prefill: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja new file mode 100644 index 000000000..e684deee8 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja @@ -0,0 +1,32 @@ +{#- Minimal DeepSeek-V4 chat template. + +DSV4 does not ship a Jinja chat_template; HuggingFace provides only the +Python `encoding_dsv4` helper. Dynamo's frontend still requires a +chat_template at startup (PromptFormatter.from_mdc), so we register this +file via --custom-jinja-template. + +This template is a best-effort DeepSeek-style formatter: delimiters +mirror DeepSeek-V3 (<|User|>, <|Assistant|>, <|end_of_sentence|>) and it +renders `reasoning_content` wrapped in ... so Dynamo's +`template_handles_reasoning` detection fires (avoids double-injection). + +sa-bench throughput runs use /v1/completions (raw prompts), so this +template is not exercised during benchmarking. If eval or chat-style +workloads are added later, replace this with a validated template +derived from deepseek-ai/DeepSeek-V4-*/encoding_dsv4.py. -#} +{%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {{ message['content'] }} + {%- elif message['role'] == 'user' -%} + <|User|>{{ message['content'] }} + {%- elif message['role'] == 'assistant' -%} + <|Assistant|> + {%- if message.get('reasoning_content') -%} + {{ message['reasoning_content'] }} + {%- endif -%} + {{ message['content'] }}<|end_of_sentence|> + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} +<|Assistant|> +{%- endif -%} From 11a4c08f78e25716a7f5d12780e63fe52fda4deb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 23:00:36 -0700 Subject: [PATCH 07/28] prompt --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_gb200-nv.sh | 4 +- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 16 +++-- .../vllm/deepseek-v4/dsv4-chat-template.jinja | 65 ++++++++++++------- 4 files changed, 58 insertions(+), 29 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 76da9e7d7..a6e0c3ce3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7431,7 +7431,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dsv4-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:deepseekv4-cu130 - model: deepseek-ai/DeepSeek-V4-Flash + model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 precision: fp4 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index cb0396421..d7d0271e7 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,8 +43,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - export 
MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index fa546761c..afba21415 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -2,14 +2,15 @@ name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" # Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). # Changes: -# * DeepSeek-V4-Flash instead of Pro (smaller model, same arch) # * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 # * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 # on PyPI still imports from vllm.inputs.data, which the vLLM in # vllm/vllm-openai:deepseekv4-cu130 no longer exposes) +# * DYN_CUSTOM_JINJA_TEMPLATE points at a derived-from-encoding_dsv4 template +# since DSV4 ships no Jinja chat_template model: - path: "deepseek-v4-flash" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" @@ -58,7 +59,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 @@ -78,7 +79,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 @@ -104,3 +105,10 @@ benchmark: osl: 1024 concurrencies: "2048" req_rate: "inf" + # DSV4's HF tokenizer ships no chat_template (README: "This release does not + # include a Jinja-format chat template"). sa-bench's --use-chat-template + # path calls tokenizer.apply_chat_template() directly on the HF tokenizer, + # which raises ValueError. Send raw random tokens via /v1/completions + # instead — correct for throughput benchmarking and matches sa-bench's + # warmup path that already succeeded in prior runs. + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja index e684deee8..b2e8a5f37 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja +++ b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja @@ -1,32 +1,53 @@ -{#- Minimal DeepSeek-V4 chat template. +{#- DeepSeek-V4 chat template. -DSV4 does not ship a Jinja chat_template; HuggingFace provides only the -Python `encoding_dsv4` helper. Dynamo's frontend still requires a -chat_template at startup (PromptFormatter.from_mdc), so we register this -file via --custom-jinja-template. 
+Derived from the reference encoding_dsv4 at +https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/tree/main/encoding +and README.md (README Quick Start): -This template is a best-effort DeepSeek-style formatter: delimiters -mirror DeepSeek-V3 (<|User|>, <|Assistant|>, <|end_of_sentence|>) and it -renders `reasoning_content` wrapped in ... so Dynamo's -`template_handles_reasoning` detection fires (avoids double-injection). + <|begin▁of▁sentence|>{system} + <|User|>{message}<|Assistant|>{reasoning}{response}<|end▁of▁sentence|> -sa-bench throughput runs use /v1/completions (raw prompts), so this -template is not exercised during benchmarking. If eval or chat-style -workloads are added later, replace this with a validated template -derived from deepseek-ai/DeepSeek-V4-*/encoding_dsv4.py. -#} -{%- for message in messages -%} - {%- if message['role'] == 'system' -%} - {{ message['content'] }} - {%- elif message['role'] == 'user' -%} - <|User|>{{ message['content'] }} +Format rules implemented: + * BOS <|begin▁of▁sentence|> once at the start, immediately followed by + the system prompt inline (no role wrapper). + * User turn: <|User|>{content} + * Assistant turn (thinking mode — DSV4 default): + <|Assistant|>{reasoning_content}{content}<|end▁of▁sentence|> + Assistant turn (chat mode, no reasoning): + <|Assistant|>{content}<|end▁of▁sentence|> + (chat mode opens an empty thinking block, per the README.) + * add_generation_prompt: <|Assistant|> (thinking mode default) + +Tool calls and the developer / latest_reminder / quick-instruction roles +from encoding_dsv4.py are NOT implemented here. sa-bench throughput runs +use /v1/completions so this template is only evaluated at frontend +startup (PromptFormatter.from_mdc); it is not invoked per-request. If +eval via /v1/chat/completions is added, expand this template to match +encoding_dsv4.py (DSML tool-call format, drop_thinking semantics, etc.). 
+-#} +{%- if messages and messages[0]['role'] == 'system' -%} + {%- set system_content = messages[0]['content'] -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set system_content = '' -%} + {%- set loop_messages = messages -%} +{%- endif -%} +<|begin▁of▁sentence|>{{ system_content }} +{%- for message in loop_messages -%} + {%- if message['role'] == 'user' -%} +<|User|>{{ message['content'] }} {%- elif message['role'] == 'assistant' -%} - <|Assistant|> +<|Assistant|> {%- if message.get('reasoning_content') -%} - {{ message['reasoning_content'] }} +{{ message['reasoning_content'] }} + {%- else -%} + {%- endif -%} - {{ message['content'] }}<|end_of_sentence|> +{{ message['content'] }}<|end▁of▁sentence|> + {%- elif message['role'] == 'tool' -%} +<|User|>{{ message['content'] }} {%- endif -%} {%- endfor -%} {%- if add_generation_prompt -%} -<|Assistant|> +<|Assistant|> {%- endif -%} From 9359fe8dc54c66c5c6f35966080883a16db17938 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 23:19:04 -0700 Subject: [PATCH 08/28] prompt --- .github/configs/nvidia-master.yaml | 14 ++--- perf-changelog.yaml | 6 +-- ...yaml => disagg-gb200-7p1d-dep8-dep16.yaml} | 48 ++++++++--------- .../vllm/deepseek-v4/dsv4-chat-template.jinja | 53 ------------------- 4 files changed, 32 insertions(+), 89 deletions(-) rename srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-5p1d-dep4-dep8.yaml => disagg-gb200-7p1d-dep8-dep16.yaml} (61%) delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a6e0c3ce3..25c312ddf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7442,16 +7442,16 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [2048] + - conc-list: [4096] prefill: - num-worker: 5 - tp: 4 - ep: 4 + num-worker: 7 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1445ad3c7..d028d4457 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,10 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: - - "Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 5p1d)" + - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 - config-keys: - dsr1-fp8-h100-dynamo-trt diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 61% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml rename to srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index afba21415..d97d1de9d 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ 
b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -1,13 +1,16 @@ -name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" +name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" -# Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). -# Changes: -# * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 -# * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 -# on PyPI still imports from vllm.inputs.data, which the vLLM in -# vllm/vllm-openai:deepseekv4-cu130 no longer exposes) -# * DYN_CUSTOM_JINJA_TEMPLATE points at a derived-from-encoding_dsv4 template -# since DSV4 ships no Jinja chat_template +# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra +# benchmark flag: use_chat_template=false. The HF tokenizer for +# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's +# --use-chat-template path calls tokenizer.apply_chat_template() and raises +# ValueError. Throughput benchmarking uses /v1/completions with random tokens +# anyway — no chat template needed. +# +# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a +# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ +# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and +# uses this native formatter — no custom Jinja template required. model: path: "deepseek-v4-pro" @@ -15,7 +18,7 @@ model: precision: "fp4" dynamo: - hash: d5803cbe71c0035a725652373a175f01942c4a33 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -23,12 +26,12 @@ setup_script: vllm-container-deps.sh resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 5 - decode_nodes: 2 - prefill_workers: 5 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 8 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -45,7 +48,6 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -54,7 +56,6 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" vllm_config: prefill: @@ -63,7 +64,7 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true @@ -72,6 +73,7 @@ backend: max-num-batched-tokens: 16384 trust-remote-code: true no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true block-size: 256 gpu-memory-utilization: 0.88 no-disable-hybrid-kv-cache-manager: true @@ -83,7 +85,7 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: auto @@ -103,12 +105,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048" + concurrencies: "4096" req_rate: "inf" - # DSV4's HF tokenizer ships no chat_template (README: "This release does not - # include a Jinja-format chat template"). sa-bench's --use-chat-template - # path calls tokenizer.apply_chat_template() directly on the HF tokenizer, - # which raises ValueError. 
Send raw random tokens via /v1/completions - # instead — correct for throughput benchmarking and matches sa-bench's - # warmup path that already succeeded in prior runs. use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja deleted file mode 100644 index b2e8a5f37..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja +++ /dev/null @@ -1,53 +0,0 @@ -{#- DeepSeek-V4 chat template. - -Derived from the reference encoding_dsv4 at -https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/tree/main/encoding -and README.md (README Quick Start): - - <|begin▁of▁sentence|>{system} - <|User|>{message}<|Assistant|>{reasoning}{response}<|end▁of▁sentence|> - -Format rules implemented: - * BOS <|begin▁of▁sentence|> once at the start, immediately followed by - the system prompt inline (no role wrapper). - * User turn: <|User|>{content} - * Assistant turn (thinking mode — DSV4 default): - <|Assistant|>{reasoning_content}{content}<|end▁of▁sentence|> - Assistant turn (chat mode, no reasoning): - <|Assistant|>{content}<|end▁of▁sentence|> - (chat mode opens an empty thinking block, per the README.) - * add_generation_prompt: <|Assistant|> (thinking mode default) - -Tool calls and the developer / latest_reminder / quick-instruction roles -from encoding_dsv4.py are NOT implemented here. sa-bench throughput runs -use /v1/completions so this template is only evaluated at frontend -startup (PromptFormatter.from_mdc); it is not invoked per-request. If -eval via /v1/chat/completions is added, expand this template to match -encoding_dsv4.py (DSML tool-call format, drop_thinking semantics, etc.). --#} -{%- if messages and messages[0]['role'] == 'system' -%} - {%- set system_content = messages[0]['content'] -%} - {%- set loop_messages = messages[1:] -%} -{%- else -%} - {%- set system_content = '' -%} - {%- set loop_messages = messages -%} -{%- endif -%} -<|begin▁of▁sentence|>{{ system_content }} -{%- for message in loop_messages -%} - {%- if message['role'] == 'user' -%} -<|User|>{{ message['content'] }} - {%- elif message['role'] == 'assistant' -%} -<|Assistant|> - {%- if message.get('reasoning_content') -%} -{{ message['reasoning_content'] }} - {%- else -%} - - {%- endif -%} -{{ message['content'] }}<|end▁of▁sentence|> - {%- elif message['role'] == 'tool' -%} -<|User|>{{ message['content'] }} - {%- endif -%} -{%- endfor -%} -{%- if add_generation_prompt -%} -<|Assistant|> -{%- endif -%} From 1d51ba1eb6372886de54b24ae2066a8216ab5a5d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 00:35:46 -0700 Subject: [PATCH 09/28] weight loading --- runners/launch_gb200-nv.sh | 38 +++++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 8 ++++ 2 files changed, 46 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d7d0271e7..db8bf4d4b 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -57,6 +57,44 @@ fi export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" +# ---- DSV4 weight pre-stage to compute-node-local NVMe ---- +# DSV4-Pro (~850 GB FP4+FP8 weights) loads too slowly from Lustre: 14 prefill +# workers contending for the same OSTs stretches the load past srtctl's +# health-check deadline. Stage once onto /mnt/numa0 (14T local NVMe RAID per +# compute node) via srun across all 18 batch-partition nodes before launching +# srtctl. 
Subsequent runs hit the local copy and skip the rsync via the +# .stage-complete marker. +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + LUSTRE_SRC="$MODEL_PATH" + STAGED_MODEL_PATH="/mnt/numa0/cache/deepseek-v4-pro" + STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" + # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) + STAGE_NODES=18 + + echo "Pre-staging DSV4 weights $LUSTRE_SRC -> $STAGED_MODEL_PATH on $STAGE_NODES nodes..." + if srun --account="$SLURM_ACCOUNT" --partition="$SLURM_PARTITION" \ + --nodes="$STAGE_NODES" --ntasks-per-node=1 \ + --time=40:00 --job-name=dsv4-prestage --exclusive \ + bash -c ' + set -e + host=$(hostname) + if [ -f "'"$STAGE_MARKER"'" ]; then + echo "[$host] already staged, skipping" + exit 0 + fi + mkdir -p "'"$STAGED_MODEL_PATH"'" + echo "[$host] rsync start: $(date -u +%H:%M:%S)" + time rsync -a --whole-file --info=stats2 "'"$LUSTRE_SRC"'/" "'"$STAGED_MODEL_PATH"'/" + touch "'"$STAGE_MARKER"'" + echo "[$host] rsync done: $(date -u +%H:%M:%S)" + '; then + echo "Pre-stage complete; pointing MODEL_PATH at local copy" + export MODEL_PATH="$STAGED_MODEL_PATH" + else + echo "WARNING: pre-stage failed (srun exit $?); falling back to Lustre MODEL_PATH=$LUSTRE_SRC" + fi +fi + NGINX_IMAGE="nginx:1.27.4" SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index d97d1de9d..1e96ed90d 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -23,6 +23,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Bump health-check from the 1800s default to 2 hours. DSV4-Pro (~850 GB +# FP4+FP8 weights) loads off Lustre slowly on a cold cache — observed +# ~33 min for 64 safetensor shards with 14 prefill workers contending for +# the same OSTs. 1800s isn't enough; 7200s gives headroom. +health_check: + max_attempts: 720 + interval_seconds: 10 + resources: gpu_type: "gb200" gpus_per_node: 4 From 4ce52cd06ab388274bcfe5b8387a2ffe57de5c3c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 02:11:12 -0700 Subject: [PATCH 10/28] sweep --- .github/configs/nvidia-master.yaml | 33 +++++- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 109 ++++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 109 ++++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 4 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 25c312ddf..4e8def37f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7442,7 +7442,38 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [4096] + # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # 10 nodes total. Low TTFT/TPOT focus. 
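+        # (Node math behind "10 nodes total", from the recipe resources:
+        # 1 prefill x 8 GPUs + 4 decodes x 8 GPUs = 40 GPUs, at 4 GPUs per
+        # GB200 node = 10 nodes.)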
+ - conc-list: [4, 8, 16, 32, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] prefill: num-worker: 7 tp: 8 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..98f613adf --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,109 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# Interactivity-focused topology: 1 prefill worker + 4 separate decode +# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more +# than aggregate throughput. Same per-worker vllm_config as the NVIDIA +# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs +# / cudagraph capture / batched-tokens), and benchmark concurrencies +# differ. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 32 + max-cudagraph-capture-size: 32 + max-num-batched-tokens: 32 + trust-remote-code: true + no-enable-prefix-caching: true + 
block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..4c59e5a73 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,109 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git 
a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 1e96ed90d..318362ef1 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -113,6 +113,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "2048x4096" req_rate: "inf" use_chat_template: false From 071643b75d115771cb657429f55c943dd68e1961 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 09:31:58 -0700 Subject: [PATCH 11/28] Add 1k/1k DSV4-Pro recipes, comment out 8k/1k for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two 1k/1k vLLM disagg recipes extrapolated from kimi-k2.5/1k1k (scaled to DSV4-Pro's DP>=8-per-worker constraint): * disagg-gb200-1p4d-dep8-dep8.yaml — interactivity (conc 4-128), 10 nodes * disagg-gb200-1p1d-dep8-dep16.yaml — mid/high throughput (conc 256-4096), 6 nodes Per-recipe tuning vs our 8k/1k baseline: * max-model-len 3072 (matches kimi 1k/1k) * prefill max-num-seqs 16 (fills 16384-token budget at 1k per seq) * decode max-num-seqs 128/512 (shorter KV -> more parallelism) nvidia-master.yaml changes: * Adds the 1k/1k seq-len-config with conc-lists stripped of 4/16/32 * Comments out the entire 8k/1k block so sweep-enabled runs don't re-trigger 8k/1k while 1k/1k numbers are collected. Re-enable by uncommenting (instructions at the top of the block). --- .github/configs/nvidia-master.yaml | 91 ++++++++++---- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 115 ++++++++++++++++++ .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 114 +++++++++++++++++ 3 files changed, 297 insertions(+), 23 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4e8def37f..282dcf85d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7439,50 +7439,95 @@ dsv4-fp4-gb200-dynamo-vllm: multinode: true disagg: true seq-len-configs: - - isl: 8192 + # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's + # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg + # at this seq-len yet (PR #67 only publishes 8k/1k). + - isl: 1024 osl: 1024 search-space: - # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # 10 nodes total. Low TTFT/TPOT focus. - - conc-list: [4, 8, 16, 32, 128] + # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. + # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten + # sweep runtime. Re-add them together with the 8k/1k block below. + - conc-list: [8, 64, 128] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: true - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] + # Mid-to-high throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). + # 6 nodes. Single prefill is plenty for 1k prompts. 
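+        # (Why one prefill suffices at 1k ISL, assuming vLLM's 16384
+        # max-num-batched-tokens budget applies per DP rank: each rank packs
+        # 16 x 1024-token prompts per step, so 8 ranks keep ~128 prefills in
+        # flight, vs the 2 seqs per rank the 8k recipes get from the same
+        # budget.)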
+ - conc-list: [256, 512, 1024, 2048, 3072, 4096] prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] - prefill: - num-worker: 7 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true + # --------------------------------------------------------------------- + # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the + # sweep-enabled gate while we collect 1k/1k data. Re-enable by + # uncommenting (remove the leading "# " on every line of the block + # below). The conc-lists already have 4/16/32 stripped — add them back + # together with the 1k/1k 1p4d block if you want the full sweep again. + # --------------------------------------------------------------------- + # - isl: 8192 + # osl: 1024 + # search-space: + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # 10 nodes total. Low TTFT/TPOT focus. + # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + # - conc-list: [2048, 4096] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..779bc8bae --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -0,0 +1,115 @@ +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" + +# 1k/1k mid-to-high throughput topology. Extrapolated from +# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's +# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles +# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high +# enough at this prompt length; see kimi precedent). 
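+#
+# Capacity sketch from the knobs below (arithmetic only, nothing measured):
+# decode runs DP=16 ranks x max-num-seqs 512 = 8192 resident seqs, double
+# the top benchmark concurrency of 4096, while prefill packs max-num-seqs
+# 16 x 1024 prompt tokens = 16384 = max-num-batched-tokens exactly.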
+# +# Differences from our 8k1k 7p1d-dep8-dep16: +# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes +# * max-model-len: 3072 instead of auto +# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq) +# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism) +# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024x2048x3072x4096" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..c6c6ee1dc --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,114 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# 1k/1k interactivity variant of the 8k/1k recipe with the same name (under +# ../8k1k/). Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml +# adjusted for DSV4-Pro's DP>=8 minimum (kimi uses TP=4, we use DP=8 per +# worker since model layers don't fit at smaller GPU counts). 
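+#
+# Headroom check (arithmetic from the knobs below, nothing measured): the
+# 4 decode workers x DP=8 give 32 ranks, so the top concurrency of 128 puts
+# only ~4 seqs on each rank; the per-rank max-num-seqs of 128 is mostly
+# slack, not a throughput knob.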
+# +# Differences from our 8k1k 1p4d-dep8-dep8: +# * max-model-len: 3072 (1024 + 1024 + 1024 headroom) instead of auto/10240 +# * prefill max-num-seqs: 16 instead of 2 (1k prompts fit 16/batch within +# the same 16384 max-num-batched-tokens budget) +# * decode max-num-seqs: 128 instead of 32 (shorter KV = more headroom) +# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + use_chat_template: false From 52b6a2e546012ef14f9472d1c3deec1c6988d5f8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 09:47:25 -0700 Subject: [PATCH 12/28] Bump health-check and add slurm.time_limit to all DSV4 recipes Previous run reported "Model did not get healthy in 1800 seconds" on the 1k/1k 1p4d-dep8-dep8 recipe despite health_check.max_attempts being set to 720. 1800s is the srtctl default, so our override either wasn't applied or wasn't enough in the face of a cold-cache Lustre load. 
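(For scale: 720 attempts x 10s intervals works out to a nominal 7200s,
four times the 1800s the error message quotes, which is why "wasn't
applied" is the likelier of the two explanations.)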
Double down on the budget:

* health_check.max_attempts: 720 -> 1440 (7200s -> 14400s = 4 hours of
  configured budget; the failed run was effectively capped at the 1800s
  default)
* slurm.time_limit: 8:00:00 explicit (srtslurm.yaml default is 6h; make
  it even wider so the SLURM wall clock can't cut off a slow load)

Applied to all five recipes (1k/1k x2 and 8k/1k x3) so the fix carries
over when the 8k/1k block in nvidia-master.yaml is re-enabled.
---
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 12 +++++++++++-
 .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml  | 12 +++++++++++-
 .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml  |  5 ++++-
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml |  5 ++++-
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 14 +++++++++-----
 5 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 779bc8bae..256db4028 100644
--- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -24,8 +24,18 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
+# a slow first-time Lustre load + cudagraph capture can't get cut off by the
+# SLURM wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
+# Lustre with multiple workers contending for the same OSTs — previous 1k/1k
+# run hit the default 1800s. Make this *very* generous since the cost of an
+# over-long deadline is just sitting idle, not wasted compute.
 health_check:
-  max_attempts: 720
+  max_attempts: 1440
   interval_seconds: 10
 
 resources:
diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
index c6c6ee1dc..576b7c8c0 100644
--- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
+++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
@@ -23,8 +23,18 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
+# a slow first-time Lustre load + cudagraph capture can't get cut off by the
+# SLURM wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
+# Lustre with multiple workers contending for the same OSTs — previous 1k/1k
+# run hit the default 1800s. Make this *very* generous since the cost of an
+# over-long deadline is just sitting idle, not wasted compute.
health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 98f613adf..7fa5e47d2 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -18,8 +18,11 @@ dynamo: setup_script: vllm-container-deps.sh +slurm: + time_limit: "8:00:00" + health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 4c59e5a73..d6b750bf2 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -18,8 +18,11 @@ dynamo: setup_script: vllm-container-deps.sh +slurm: + time_limit: "8:00:00" + health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 318362ef1..695db772a 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -23,12 +23,16 @@ dynamo: setup_script: vllm-container-deps.sh -# Bump health-check from the 1800s default to 2 hours. DSV4-Pro (~850 GB -# FP4+FP8 weights) loads off Lustre slowly on a cold cache — observed -# ~33 min for 64 safetensor shards with 14 prefill workers contending for -# the same OSTs. 1800s isn't enough; 7200s gives headroom. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads +# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor +# shards with 14 prefill workers contending for the same OSTs. The first +# bump to 7200s was still insufficient in one case, so pad generously to +# 14400s (4h). Over-long deadline only costs idle time, not compute. health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: From 768cddcc8343d9759b0cbf5d5bea70a9324aaeeb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 12:58:52 -0700 Subject: [PATCH 13/28] Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1k DSV4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces our hand-rolled 8k/1k DSV4-Pro vLLM disagg recipes with the four topologies from NVIDIA/srt-slurm PR #71 (source fork: alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68, pinned at commit d60e3f1c). PR #71 supersedes PR #67 that our original 8k/1k recipes were based on, with more topologies, a wider concurrency sweep per recipe, new env vars, explicit tokenizer-mode, and CPU/DRAM expert offload. We take everything except offload: * launch_gb200-nv.sh clones alec-flowers/srt-slurm for dsv4 instead of NVIDIA/srt-slurm. * Runtime post-clone patch strips `offload-group-size`, `offload-num-in-group`, `offload-prefetch-step`, and the commented `# offload-params` line from all four 8k/1k recipes. 
* Same post-clone patch injects our `slurm.time_limit: 8:00:00` and `health_check: {max_attempts: 1440, interval_seconds: 10}` (4 h budget) so the recipes match our cold-cache Lustre load budget. * Model-path alias changed from `deepseek-v4-pro` to `deepseekv4-fp4` to match PR #71 recipes' `model.path` field; 1k/1k local recipes updated to the same alias. * nvidia-master.yaml 8k/1k block rewritten: 4 search-space entries (1p1d-dep8-dep8, 3p1d-dep8-dep8, 3p1d-dep8-dep16, 6p1d-dep8-dep16), each running conc list [4, 8, 16, 32, 64, 256, 512, 1024] — 32 total 8k/1k benchmark points across 4 cluster startups. * Obsolete local 8k/1k recipes under srt-slurm-recipes/vllm/deepseek-v4/8k1k/ removed (superseded by the PR #71 upstream files). 1k/1k sweep is unchanged otherwise (2 matrix entries, 9 benchmark points using the hand-rolled recipes — no PR #71 equivalent at 1k/1k). --- .github/configs/nvidia-master.yaml | 119 +++++++++-------- runners/launch_gb200-nv.sh | 47 ++++++- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ---------------- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 112 ---------------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 122 ------------------ 7 files changed, 109 insertions(+), 407 deletions(-) delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8f294462e..3841ed833 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7497,58 +7497,67 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # --------------------------------------------------------------------- - # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the - # sweep-enabled gate while we collect 1k/1k data. Re-enable by - # uncommenting (remove the leading "# " on every line of the block - # below). The conc-lists already have 4/16/32 stripped — add them back - # together with the 1k/1k 1p4d block if you want the full sweep again. - # --------------------------------------------------------------------- - # - isl: 8192 - # osl: 1024 - # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # # 10 nodes total. Low TTFT/TPOT focus. - # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. - # - conc-list: [8, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. 
- # - conc-list: [2048, 4096] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # 8k/1k — four topologies from NVIDIA/srt-slurm PR #71 (the alec-flowers + # fork is cloned instead of NVIDIA/srt-slurm and patched at runtime to + # strip CPU/DRAM expert offload). Each recipe runs the full conc list + # [4, 8, 16, 32, 64, 256, 512, 1024] (8 points) giving cross-topology + # coverage. Total 8k/1k points: 32. + - isl: 8192 + osl: 1024 + search-space: + # 1p1d-dep8-dep8 — 1 prefill + 1 decode, each DP=8. 4 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d-dep8-dep8 — 3 prefill + 1 decode, each DP=8. 8 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d-dep8-dep16 — 3 prefill (DP=8) + 1 wide decode (DP=16). 10 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 6p1d-dep8-dep16 — 6 prefill (DP=8) + 1 wide decode (DP=16). 16 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index db8bf4d4b..da321bdb4 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,8 +43,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Model path alias matches NVIDIA srt-slurm PR #71 recipes + # (`model.path: "deepseekv4-fp4"`). export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 @@ -66,7 +68,7 @@ export SLURM_ACCOUNT="benchmark" # .stage-complete marker. 
if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then LUSTRE_SRC="$MODEL_PATH" - STAGED_MODEL_PATH="/mnt/numa0/cache/deepseek-v4-pro" + STAGED_MODEL_PATH="/mnt/numa0/cache/deepseekv4-fp4" STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) STAGE_NODES=18 @@ -176,10 +178,47 @@ if [ -d "$SRT_REPO_DIR" ]; then fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + # alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68 + # (https://github.com/NVIDIA/srt-slurm/pull/71) — supersedes PR #67 with + # 4 GB200 DSV4-Pro vLLM disagg recipes (1p1d, 3p1d-dep8, 3p1d-dep16, + # 6p1d-dep16), NUMA binding, new env vars, and explicit tokenizer-mode. + # Pinned to PR #71 head for reproducibility. + git clone https://github.com/alec-flowers/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout d60e3f1c7921721e52af01afaab59a70a1631106 + # Copy our hand-rolled 1k/1k recipes (no upstream equivalent for vLLM + # disagg at 1k/1k yet). 8k/1k recipes come from the upstream clone. cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + # PR #71's 8k/1k recipes include CPU/DRAM expert offload (offload-* + # knobs + a companion vllm_numa_bind_hash_fix.py patch). Strip the + # offload lines and inject our health_check + slurm.time_limit + # overrides so the recipes run without offload and with a generous + # cold-cache Lustre load budget. + python3 - <<'PY' +from pathlib import Path +for p in Path("recipes/vllm/deepseek-v4-pro/8k1k").glob("disagg-gb200-*.yaml"): + text = p.read_text() + # Drop offload-* knobs and the commented `# offload-params:` line. + kept = [] + for line in text.splitlines(): + stripped = line.lstrip() + if stripped.startswith("offload-") or stripped.startswith("# offload-params:"): + continue + kept.append(line) + text = "\n".join(kept) + ("\n" if text.endswith("\n") else "") + # Inject slurm.time_limit and health_check overrides after setup_script. 
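+    # The `"health_check:" not in text` guard below makes this idempotent:
+    # a recipe that already carries the override is left untouched on a
+    # re-run, and count=1 pins the injection to the first marker occurrence.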
+ marker = "setup_script: vllm-container-deps.sh\n" + if marker in text and "health_check:" not in text: + text = text.replace( + marker, + marker + + "\nslurm:\n time_limit: \"8:00:00\"\n" + + "\nhealth_check:\n max_attempts: 1440\n interval_seconds: 10\n", + 1, + ) + p.write_text(text) + print(f"patched {p}") +PY elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 256db4028..4204c26b5 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -14,7 +14,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 model: - path: "deepseek-v4-pro" + path: "deepseekv4-fp4" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 576b7c8c0..9981de640 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -13,7 +13,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 model: - path: "deepseek-v4-pro" + path: "deepseekv4-fp4" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml deleted file mode 100644 index 7fa5e47d2..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" - -# Interactivity-focused topology: 1 prefill worker + 4 separate decode -# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more -# than aggregate throughput. Same per-worker vllm_config as the NVIDIA -# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs -# / cudagraph capture / batched-tokens), and benchmark concurrencies -# differ. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 32 - max-cudagraph-capture-size: 32 - max-num-batched-tokens: 32 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x32x128" - req_rate: "inf" - use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index d6b750bf2..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" - use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml deleted file mode 100644 index 695db772a..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ /dev/null @@ -1,122 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" - -# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra -# benchmark flag: use_chat_template=false. The HF tokenizer for -# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's -# --use-chat-template path calls tokenizer.apply_chat_template() and raises -# ValueError. Throughput benchmarking uses /v1/completions with random tokens -# anyway — no chat template needed. -# -# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a -# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ -# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and -# uses this native formatter — no custom Jinja template required. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads -# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor -# shards with 14 prefill workers contending for the same OSTs. The first -# bump to 7200s was still insufficient in one case, so pad generously to -# 14400s (4h). Over-long deadline only costs idle time, not compute. -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048x4096" - req_rate: "inf" - use_chat_template: false From af10ca0c63ffeee76276db421ed8185823e9737e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:16:39 -0700 Subject: [PATCH 14/28] path --- runners/launch_gb200-nv.sh | 43 +++----------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index da321bdb4..40a884086 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -44,8 +44,9 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # Model path alias matches NVIDIA srt-slurm PR #71 recipes - # (`model.path: "deepseekv4-fp4"`). 
- export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" + # (`model.path: "deepseekv4-fp4"`). Weights live on compute-node + # local NVMe (/mnt/numa1) for fast startup — no Lustre contention. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" @@ -59,44 +60,6 @@ fi export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" -# ---- DSV4 weight pre-stage to compute-node-local NVMe ---- -# DSV4-Pro (~850 GB FP4+FP8 weights) loads too slowly from Lustre: 14 prefill -# workers contending for the same OSTs stretches the load past srtctl's -# health-check deadline. Stage once onto /mnt/numa0 (14T local NVMe RAID per -# compute node) via srun across all 18 batch-partition nodes before launching -# srtctl. Subsequent runs hit the local copy and skip the rsync via the -# .stage-complete marker. -if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - LUSTRE_SRC="$MODEL_PATH" - STAGED_MODEL_PATH="/mnt/numa0/cache/deepseekv4-fp4" - STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" - # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) - STAGE_NODES=18 - - echo "Pre-staging DSV4 weights $LUSTRE_SRC -> $STAGED_MODEL_PATH on $STAGE_NODES nodes..." - if srun --account="$SLURM_ACCOUNT" --partition="$SLURM_PARTITION" \ - --nodes="$STAGE_NODES" --ntasks-per-node=1 \ - --time=40:00 --job-name=dsv4-prestage --exclusive \ - bash -c ' - set -e - host=$(hostname) - if [ -f "'"$STAGE_MARKER"'" ]; then - echo "[$host] already staged, skipping" - exit 0 - fi - mkdir -p "'"$STAGED_MODEL_PATH"'" - echo "[$host] rsync start: $(date -u +%H:%M:%S)" - time rsync -a --whole-file --info=stats2 "'"$LUSTRE_SRC"'/" "'"$STAGED_MODEL_PATH"'/" - touch "'"$STAGE_MARKER"'" - echo "[$host] rsync done: $(date -u +%H:%M:%S)" - '; then - echo "Pre-stage complete; pointing MODEL_PATH at local copy" - export MODEL_PATH="$STAGED_MODEL_PATH" - else - echo "WARNING: pre-stage failed (srun exit $?); falling back to Lustre MODEL_PATH=$LUSTRE_SRC" - fi -fi - NGINX_IMAGE="nginx:1.27.4" SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" From f5245845e92ae8527774b2f5e47098ad916438c7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:06:01 -0700 Subject: [PATCH 15/28] Revert "Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1k DSV4" This reverts commit 768cddcc8343d9759b0cbf5d5bea70a9324aaeeb. 
--- .github/configs/nvidia-master.yaml | 119 ++++++++--------- runners/launch_gb200-nv.sh | 49 +------ .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 112 ++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 122 ++++++++++++++++++ 7 files changed, 409 insertions(+), 109 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3841ed833..8f294462e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7497,67 +7497,58 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # 8k/1k — four topologies from NVIDIA/srt-slurm PR #71 (the alec-flowers - # fork is cloned instead of NVIDIA/srt-slurm and patched at runtime to - # strip CPU/DRAM expert offload). Each recipe runs the full conc list - # [4, 8, 16, 32, 64, 256, 512, 1024] (8 points) giving cross-topology - # coverage. Total 8k/1k points: 32. - - isl: 8192 - osl: 1024 - search-space: - # 1p1d-dep8-dep8 — 1 prefill + 1 decode, each DP=8. 4 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # 3p1d-dep8-dep8 — 3 prefill + 1 decode, each DP=8. 8 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # 3p1d-dep8-dep16 — 3 prefill (DP=8) + 1 wide decode (DP=16). 10 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # 6p1d-dep8-dep16 — 6 prefill (DP=8) + 1 wide decode (DP=16). 16 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + # --------------------------------------------------------------------- + # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the + # sweep-enabled gate while we collect 1k/1k data. Re-enable by + # uncommenting (remove the leading "# " on every line of the block + # below). The conc-lists already have 4/16/32 stripped — add them back + # together with the 1k/1k 1p4d block if you want the full sweep again. + # --------------------------------------------------------------------- + # - isl: 8192 + # osl: 1024 + # search-space: + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # 10 nodes total. Low TTFT/TPOT focus. 
+ # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + # - conc-list: [2048, 4096] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 40a884086..6c8e706f1 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,11 +43,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Model path alias matches NVIDIA srt-slurm PR #71 recipes - # (`model.path: "deepseekv4-fp4"`). Weights live on compute-node - # local NVMe (/mnt/numa1) for fast startup — no Lustre contention. + # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre + # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the + # model.path alias in our DSV4 recipes. export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 @@ -141,47 +141,10 @@ if [ -d "$SRT_REPO_DIR" ]; then fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - # alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68 - # (https://github.com/NVIDIA/srt-slurm/pull/71) — supersedes PR #67 with - # 4 GB200 DSV4-Pro vLLM disagg recipes (1p1d, 3p1d-dep8, 3p1d-dep16, - # 6p1d-dep16), NUMA binding, new env vars, and explicit tokenizer-mode. - # Pinned to PR #71 head for reproducibility. - git clone https://github.com/alec-flowers/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout d60e3f1c7921721e52af01afaab59a70a1631106 - # Copy our hand-rolled 1k/1k recipes (no upstream equivalent for vLLM - # disagg at 1k/1k yet). 8k/1k recipes come from the upstream clone. + git checkout sa-submission-q2-2026 cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 - # PR #71's 8k/1k recipes include CPU/DRAM expert offload (offload-* - # knobs + a companion vllm_numa_bind_hash_fix.py patch). Strip the - # offload lines and inject our health_check + slurm.time_limit - # overrides so the recipes run without offload and with a generous - # cold-cache Lustre load budget. 
- python3 - <<'PY' -from pathlib import Path -for p in Path("recipes/vllm/deepseek-v4-pro/8k1k").glob("disagg-gb200-*.yaml"): - text = p.read_text() - # Drop offload-* knobs and the commented `# offload-params:` line. - kept = [] - for line in text.splitlines(): - stripped = line.lstrip() - if stripped.startswith("offload-") or stripped.startswith("# offload-params:"): - continue - kept.append(line) - text = "\n".join(kept) + ("\n" if text.endswith("\n") else "") - # Inject slurm.time_limit and health_check overrides after setup_script. - marker = "setup_script: vllm-container-deps.sh\n" - if marker in text and "health_check:" not in text: - text = text.replace( - marker, - marker - + "\nslurm:\n time_limit: \"8:00:00\"\n" - + "\nhealth_check:\n max_attempts: 1440\n interval_seconds: 10\n", - 1, - ) - p.write_text(text) - print(f"patched {p}") -PY elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 4204c26b5..256db4028 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -14,7 +14,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 model: - path: "deepseekv4-fp4" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 9981de640..576b7c8c0 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -13,7 +13,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 model: - path: "deepseekv4-fp4" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..7fa5e47d2 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,112 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# Interactivity-focused topology: 1 prefill worker + 4 separate decode +# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more +# than aggregate throughput. Same per-worker vllm_config as the NVIDIA +# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs +# / cudagraph capture / batched-tokens), and benchmark concurrencies +# differ. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 32 + max-cudagraph-capture-size: 32 + max-num-batched-tokens: 32 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..d6b750bf2 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,112 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..695db772a --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -0,0 +1,122 @@ +name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" + +# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra +# benchmark flag: use_chat_template=false. The HF tokenizer for +# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's +# --use-chat-template path calls tokenizer.apply_chat_template() and raises +# ValueError. Throughput benchmarking uses /v1/completions with random tokens +# anyway — no chat template needed. +# +# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a +# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ +# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and +# uses this native formatter — no custom Jinja template required. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads +# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor +# shards with 14 prefill workers contending for the same OSTs. The first +# bump to 7200s was still insufficient in one case, so pad generously to +# 14400s (4h). Over-long deadline only costs idle time, not compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" + use_chat_template: false From 18100e54697ad26ecb18ceb08c58dc0568afbdb4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:47:18 -0700 Subject: [PATCH 16/28] Add 1k/1k 3p1d-dep8-dep16 recipe for high concurrency (4096, 8192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing 1k/1k 1p1d-dep8-dep16 recipe runs out of prefill at conc>=8192 — single DP=8 prefill worker can sustain ~80-150K tok/s, not the ~200-300K tok/s of demand at conc=8192. New 3p1d-dep8-dep16 recipe adds 2 more prefill workers (10 nodes total). Decode capacity bumped to max-num-seqs=1024 (vs 512 in 1p1d) so conc=8192 has headroom (per-rank 8192/16 = 512, well below 1024). 
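Back-of-envelope for the demand figure (a sketch, not a measurement; the
~35 s end-to-end request latency is an assumed round number):

    conc = 8192        # concurrent requests
    isl = 1024         # input tokens per request
    latency_s = 35.0   # assumed time each request occupies its slot
    demand = conc * isl / latency_s
    # ~240K prefill tok/s, i.e. ~2-3 DP=8 workers at 80-150K tok/s each
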
max-cudagraph-capture-size kept at 512 — steady-state per-rank batch is ~512 so cudagraphs still apply. conc-list overlap at 4096 between the two topologies gives a direct crossover comparison point. --- .github/configs/nvidia-master.yaml | 20 ++- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 117 ++++++++++++++++++ 2 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8f294462e..6bac5ee98 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7482,8 +7482,8 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - # Mid-to-high throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts. + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). + # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] prefill: num-worker: 1 @@ -7497,6 +7497,22 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # The 4096 overlap with the 1p1d block gives a crossover point. 8192 + # would saturate 1p1d's prefill, so this topology takes over there. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true # --------------------------------------------------------------------- # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the # sweep-enabled gate while we collect 1k/1k data. Re-enable by diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..63e9e280c --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,117 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those +# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) +# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s +# exceeds what one DP=8 worker can sustain. +# +# Decode capacity: +# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which +# leaves headroom over the conc=8192 working set (per-rank avg 512). +# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is +# ~512 so cudagraphs still apply at steady state. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false From 84be0b3a1b21007386e9f6a7cb82c8a3deda7abe Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:48:50 -0700 Subject: [PATCH 17/28] change concs --- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 256db4028..75b3d2770 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024x2048x3072x4096" + concurrencies: "256x512x1024x2048x4096" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml 
b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 576b7c8c0..59427712c 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -119,6 +119,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x16x32x64x128" + concurrencies: "8x32x64x128" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 7fa5e47d2..ef6dcdc24 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -107,6 +107,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x16x32x128" + concurrencies: "8x32x128" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 695db772a..6213373b3 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -117,6 +117,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048x4096" + concurrencies: "4096x8192" req_rate: "inf" use_chat_template: false From 8b1fbe29fbc277c775318aa1dfaa7a353cf593e4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 15:41:09 -0700 Subject: [PATCH 18/28] Move srt-slurm-recipes/ under benchmarks/multi_node/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recipes are part of the multi-node benchmark plumbing — they belong next to the other multi-node assets (amd_utils/, dsr1_*_sglang-disagg.sh, gptoss_fp4_gb200_dynamo-trt.sh) rather than at the repo root. Updates the launch script's `cp -r` source path. The reference in perf-changelog.yaml's historical entry is left untouched (additions-only gate; it's only a description string). 
--- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 0 .../vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 0 runners/launch_gb200-nv.sh | 2 +- 7 files changed, 1 insertion(+), 1 deletion(-) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml (100%) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 6c8e706f1..45d49c09b 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -144,7 +144,7 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q2-2026
-    cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+    cp -r "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"

From e095e00b8cdb7c1f37d82819593614051e1f9220 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 16:17:33 -0700
Subject: [PATCH 19/28] Add 1p4d-dep8-tep8 TEP recipes for low concurrency
 (1k/1k + 8k/1k)

Decode workers use TP=8 within each worker (no data-parallel decode),
shedding attention-layer memory pressure compared to the dep8-dep8
sibling at the cost of an inter-rank TP all-reduce per attention layer.

Each rank holds:
* dep8 sibling: full attention replica + 1/8 of experts (EP=8)
* tep8 (this): 1/8 of attention (TP=8 sharded) + 1/8 experts (EP=8)

Same node count (10) and same conc-list as the dep8-dep8 sibling so the
two are directly comparable. Useful at low concurrency where TP
all-reduce overhead is a smaller fraction of step time.

Topology pattern derived from
kimi-k2.5/{1k1k,8k1k}/disagg-gb200-1p4d-dep4-tep4.yaml (the only vLLM
disagg TEP precedent on GB200 in upstream srt-slurm). Scaled to TP=8
because DSV4-Pro's attention layers don't fit the per-rank budget at
TP=4.

nvidia-master.yaml:
* Adds the 1k/1k TEP entry as a sibling to the existing dep8-dep8 entry
  (same conc-list [8, 64, 128], active).
* Adds the 8k/1k TEP entry inside the still-commented 8k/1k block
  (conc-list [8, 128]) so it's present when 8k/1k is re-enabled.
---
 .github/configs/nvidia-master.yaml            |  38 +++++-
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    |   2 +-
 .../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml     | 125 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml     |   2 +-
 .../8k1k/disagg-gb200-1p4d-dep8-tep8.yaml     | 119 +++++++++++++++++
 5 files changed, 282 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6bac5ee98..d1829c64b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7466,7 +7466,8 @@ dsv4-fp4-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-        # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes.
+        # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each).
+        # 10 nodes. Each decode rank holds full attention replica + 1/8 experts.
         # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten
         # sweep runtime. Re-add them together with the 8k/1k block below.
         - conc-list: [8, 64, 128]
@@ -7482,6 +7483,24 @@
             tp: 8
             ep: 8
             dp-attn: true
+        # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each).
+        # 10 nodes — same node count as the dep8-dep8 sibling.
Each decode rank + # holds 1/8 of attention (TP-sharded) + 1/8 of experts (EP), trading + # weight-memory headroom for an inter-rank TP all-reduce per attention + # layer. Same conc-list as the dep8 entry so they're directly comparable. + - conc-list: [8, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7523,7 +7542,7 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). # # 10 nodes total. Low TTFT/TPOT focus. # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. # - conc-list: [8, 128] @@ -7539,6 +7558,21 @@ dsv4-fp4-gb200-dynamo-vllm: # tp: 8 # ep: 8 # dp-attn: true + # # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). + # # 10 nodes. Same conc-list as the dep8 sibling for direct A/B. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 1 + # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 75b3d2770..bf5b441b9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024x2048x4096" + concurrencies: "128x256x1024x2048x4096" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml new file mode 100644 index 000000000..b4567b5ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -0,0 +1,125 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" + +# 1k/1k TEP variant for low concurrency (4-128). +# +# Decode workers use tensor parallelism (TP=8) within each worker instead +# of data parallelism. Each rank holds 1/8 of attention/embedding (sharded) +# plus 1/8 of experts (EP) — vs the dep8 variant where each rank holds the +# full attention replica plus 1/8 of experts. TEP frees ~80-160 GB per rank +# of weight memory at the cost of an inter-rank TP all-reduce on every +# attention layer. At low conc (where attention all-reduce overhead is a +# smaller fraction of step time), this can be a net win on TTFT/TPOT. +# +# Topology: 1 prefill (DP=8) + 4 decode (TP=8 each). 10 nodes. Same node +# count as 1p4d-dep8-dep8, different memory split. +# +# Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml — the +# only vLLM disagg TEP precedent on GB200 in upstream srt-slurm. 
Scaled +# from kimi's TP=4 to TP=8 because DSV4-Pro is too large to TP-shard at 4. +# No upstream NVIDIA reference for DSV4-Pro TEP yet. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + # 4 decode workers x 128 = 512 total simultaneous slots, well above + # max conc=128 in this entry. KV is TP=8-sharded so per-rank KV is + # 1/8 the dep8 case; we can afford the larger max-num-seqs. 
+ max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x64x128" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index ef6dcdc24..0b000b8e3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -107,6 +107,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x32x128" + concurrencies: "4x8x32x64" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml new file mode 100644 index 000000000..e11c9a361 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -0,0 +1,119 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" + +# 8k/1k TEP variant for low concurrency (4-128). +# +# See ../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml for the rationale (TP=8 +# decode workers shed attention-layer memory pressure vs the dep8 sibling +# at the cost of TP all-reduce latency). +# +# Differences from the 1k/1k version: +# * max-model-len: auto (matches NVIDIA PR #67-style 8k/1k recipes) +# * prefill max-num-seqs: 2 (NVIDIA's value — 8k inputs fill the 16384 +# max-num-batched-tokens budget at 2 prefills/batch) +# * decode max-num-seqs: 64 (KV is 8x larger per request than 1k/1k; +# even with TP=8 sharding, conservative max-num-seqs vs the 128 in +# the 1k/1k sibling. 4 workers x 64 = 256 simultaneous, plenty for +# the conc=128 max in this entry.) 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. 
+ tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x128" + req_rate: "inf" + use_chat_template: false From 4666f607cf26fbb7ea064440cc971e5cd7b3647a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:27:39 -0700 Subject: [PATCH 20/28] conc changes --- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml index b4567b5ce..049d6d55f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "8x64x128" + concurrencies: "4x16x64x128" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml index e11c9a361..6dee55304 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -114,6 +114,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x128" + concurrencies: "4x16x64x128" req_rate: "inf" use_chat_template: false From a51db718a5770602736308a81d08660a43307142 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:38:23 -0700 Subject: [PATCH 21/28] perfchangelog --- perf-changelog.yaml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e53e2f66a..84737bd78 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,12 +1,4 @@ - config-keys: -<<<<<<< dsv4-fp4-gb200-dynamo-vllm-disagg - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 -======= - dsv4-fp4-b200-sglang description: - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" @@ -15,7 +7,6 @@ - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - "Prefix caching and speculative decoding disabled for baseline numbers" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 ->>>>>>> main - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1775,3 +1766,11 @@ - "Prefix caching disabled, no speculative decoding" - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1143 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" + - "Container: vllm/vllm-openai:deepseekv4-cu130" + - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 From c23c9fa9560724b71ea8f89b5001546782c0cfc0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:03:17 -0700 Subject: [PATCH 22/28] Undo 1p4d-dep8-tep8 TEP recipes Reverts the experimental TEP-decode variant for low concurrency. Removes both 1k/1k and 8k/1k recipe files plus the active 1k/1k search-space entry and the (still-commented) 8k/1k entry in nvidia-master.yaml. Reverts the 'Interactivity (DP-decode)' / 'Interactivity (TEP-decode)' naming back to plain 'Interactivity' on the dep8-dep8 entries. --- .github/configs/nvidia-master.yaml | 38 +----- .../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 125 ------------------ .../8k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 119 ----------------- 3 files changed, 2 insertions(+), 280 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a5b4087a8..97ebc9c67 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7530,8 +7530,7 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). - # 10 nodes. Each decode rank holds full attention replica + 1/8 experts. + # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten # sweep runtime. Re-add them together with the 8k/1k block below. - conc-list: [8, 64, 128] @@ -7547,24 +7546,6 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). - # 10 nodes — same node count as the dep8-dep8 sibling. Each decode rank - # holds 1/8 of attention (TP-sharded) + 1/8 of experts (EP), trading - # weight-memory headroom for an inter-rank TP all-reduce per attention - # layer. Same conc-list as the dep8 entry so they're directly comparable. - - conc-list: [8, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7606,7 +7587,7 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). # # 10 nodes total. Low TTFT/TPOT focus. # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. # - conc-list: [8, 128] @@ -7622,21 +7603,6 @@ dsv4-fp4-gb200-dynamo-vllm: # tp: 8 # ep: 8 # dp-attn: true - # # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). - # # 10 nodes. 
Same conc-list as the dep8 sibling for direct A/B. - # - conc-list: [8, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 1 - # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml deleted file mode 100644 index 049d6d55f..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" - -# 1k/1k TEP variant for low concurrency (4-128). -# -# Decode workers use tensor parallelism (TP=8) within each worker instead -# of data parallelism. Each rank holds 1/8 of attention/embedding (sharded) -# plus 1/8 of experts (EP) — vs the dep8 variant where each rank holds the -# full attention replica plus 1/8 of experts. TEP frees ~80-160 GB per rank -# of weight memory at the cost of an inter-rank TP all-reduce on every -# attention layer. At low conc (where attention all-reduce overhead is a -# smaller fraction of step time), this can be a net win on TTFT/TPOT. -# -# Topology: 1 prefill (DP=8) + 4 decode (TP=8 each). 10 nodes. Same node -# count as 1p4d-dep8-dep8, different memory split. -# -# Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml — the -# only vLLM disagg TEP precedent on GB200 in upstream srt-slurm. Scaled -# from kimi's TP=4 to TP=8 because DSV4-Pro is too large to TP-shard at 4. -# No upstream NVIDIA reference for DSV4-Pro TEP yet. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 3072 - # 4 decode workers x 128 = 512 total simultaneous slots, well above - # max conc=128 in this entry. KV is TP=8-sharded so per-rank KV is - # 1/8 the dep8 case; we can afford the larger max-num-seqs. - max-num-seqs: 128 - max-cudagraph-capture-size: 128 - max-num-batched-tokens: 128 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x16x64x128" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml deleted file mode 100644 index 6dee55304..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ /dev/null @@ -1,119 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" - -# 8k/1k TEP variant for low concurrency (4-128). -# -# See ../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml for the rationale (TP=8 -# decode workers shed attention-layer memory pressure vs the dep8 sibling -# at the cost of TP all-reduce latency). 
-# -# Differences from the 1k/1k version: -# * max-model-len: auto (matches NVIDIA PR #67-style 8k/1k recipes) -# * prefill max-num-seqs: 2 (NVIDIA's value — 8k inputs fill the 16384 -# max-num-batched-tokens budget at 2 prefills/batch) -# * decode max-num-seqs: 64 (KV is 8x larger per request than 1k/1k; -# even with TP=8 sharding, conservative max-num-seqs vs the 128 in -# the 1k/1k sibling. 4 workers x 64 = 256 simultaneous, plenty for -# the conc=128 max in this entry.) - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x16x64x128" - req_rate: "inf" - use_chat_template: false From 7c8b85919bb2074726a3d31cae83f4f1c4b56373 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:12:40 -0700 Subject: [PATCH 23/28] Adopt NVIDIA aflowers/gb200-dsv4-recipes 1p1d-dep8-tep8 for low conc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the NVIDIA-official TEP recipe for very low concurrency: https://github.com/NVIDIA/srt-slurm/blob/aflowers/gb200-dsv4-recipes/ recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml Topology: 1 prefill (DP=8) + 1 decode (TP=8) — 4 nodes. 
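(Node math, from the recipes' gpus_per_node: 4 on GB200: 1 prefill
worker x 8 GPUs + 1 decode worker x 8 GPUs = 16 GPUs = 4 nodes.)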
Adds 1k/1k sibling (no upstream equivalent) by shrinking max-model-len to 3072. Local deviations from upstream (documented in recipe headers): * model.path renamed deepseekv4-fp4 -> deepseek-v4-pro to match our launch script's SRT_SLURM_MODEL_PREFIX. * Stripped CPU/DRAM offload knobs and numa-bind (our pinned NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the vllm_numa_bind_hash_fix.py patch upstream uses). * benchmark.use_chat_template: false (no PR #68 sa-bench changes in our srtctl); benchmark.tokenizer_mode dropped for the same reason. * Container kept on the floating tag; health_check + slurm.time_limit added for cold-cache Lustre loads. Replaces the 1p4d-dep8-dep8 low-conc entries (10-node, 4 decode workers) with this 4-node TEP topology in both 1k/1k (active) and 8k/1k (still commented). Deletes the now-unused 1p4d-dep8-dep8 recipe files. Active 1k/1k sweep: 3 entries / 14 benchmark points. --- .github/configs/nvidia-master.yaml | 31 ++-- ....yaml => disagg-gb200-1p1d-dep8-tep8.yaml} | 72 +++++---- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 149 ++++++++++++++++++ .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ------------- 4 files changed, 206 insertions(+), 158 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/{disagg-gb200-1p4d-dep8-dep8.yaml => disagg-gb200-1p1d-dep8-tep8.yaml} (54%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 97ebc9c67..91b771a67 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7530,22 +7530,22 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. - # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten - # sweep runtime. Re-add them together with the 8k/1k block below. - - conc-list: [8, 64, 128] + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch + # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). + - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: - num-worker: 4 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7587,22 +7587,21 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # # 10 nodes total. Low TTFT/TPOT focus. - # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. - # - conc-list: [8, 128] + # # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. 
+ # - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 # tp: 8 # ep: 8 # dp-attn: true # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: - # num-worker: 4 + # num-worker: 1 # tp: 8 - # ep: 8 - # dp-attn: true + # ep: 1 + # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 54% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 59427712c..c25de42a0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -1,16 +1,15 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" -# 1k/1k interactivity variant of the 8k/1k recipe with the same name (under -# ../8k1k/). Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml -# adjusted for DSV4-Pro's DP>=8 minimum (kimi uses TP=4, we use DP=8 per -# worker since model layers don't fit at smaller GPU counts). +# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from +# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only +# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). No +# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet. # -# Differences from our 8k1k 1p4d-dep8-dep8: -# * max-model-len: 3072 (1024 + 1024 + 1024 headroom) instead of auto/10240 -# * prefill max-num-seqs: 16 instead of 2 (1k prompts fit 16/batch within -# the same 16384 max-num-batched-tokens budget) -# * decode max-num-seqs: 128 instead of 32 (shorter KV = more headroom) -# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64). +# +# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see +# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list. model: path: "deepseek-v4-pro" @@ -23,16 +22,9 @@ dynamo: setup_script: vllm-container-deps.sh -# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so -# a slow first-time Lustre load + cudagraph capture can't get cut off by the -# SLURM wall clock. slurm: time_limit: "8:00:00" -# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from -# Lustre with multiple workers contending for the same OSTs — previous 1k/1k -# run hit the default 1800s. Make this *very* generous since the cost of an -# over-long deadline is just sitting idle, not wasted compute. 
health_check: max_attempts: 1440 interval_seconds: 10 @@ -41,9 +33,9 @@ resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 8 + decode_nodes: 2 prefill_workers: 1 - decode_workers: 4 + decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 @@ -56,20 +48,38 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: @@ -84,41 +94,43 @@ backend: enforce-eager: true max-model-len: 3072 max-num-seqs: 16 - max-num-batched-tokens: 16384 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 1 + tensor-parallel-size: 8 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 3072 - max-num-seqs: 128 - max-cudagraph-capture-size: 128 - max-num-batched-tokens: 128 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "8x32x64x128" + concurrencies: "1x4x8x16x32x64" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..1cf645e52 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,149 @@ +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch: +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. 
Targets +# very low concurrency (1-64) where TEP-style decode (TP-sharded +# attention + EP'd experts within one worker) gives the best per-user +# latency. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# our launch script's SRT_SLURM_MODEL_PREFIX. +# * CPU/DRAM offload knobs (offload-group-size / -num-in-group / +# -prefetch-step / # offload-params) and numa-bind dropped — our +# clone is NVIDIA/srt-slurm@sa-submission-q2-2026 which doesn't ship +# the vllm_numa_bind_hash_fix.py patch. +# * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode +# dropped. Both require PR #68 sa-bench tokenizer support that our +# pinned srtctl version doesn't have. The recipe-level +# `tokenizer-mode: deepseek_v4` for workers stays. +# * Container kept on the floating tag (`:deepseekv4-cu130`) instead of +# the upstream sha256 pin. +# * health_check / slurm.time_limit added — we observed cold-cache +# Lustre loads exceeding the default 1800s deadline. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + 
pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml deleted file mode 100644 index 0b000b8e3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" - -# Interactivity-focused topology: 1 prefill worker + 4 separate decode -# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more -# than aggregate throughput. Same per-worker vllm_config as the NVIDIA -# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs -# / cudagraph capture / batched-tokens), and benchmark concurrencies -# differ. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 32 - max-cudagraph-capture-size: 32 - max-num-batched-tokens: 32 - trust-remote-code: true - 
no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x32x64" - req_rate: "inf" - use_chat_template: false From 42d9107fcb6c091ec58096b21f3893e30e1755db Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:33:30 -0700 Subject: [PATCH 24/28] Re-add CPU/DRAM offload to 1p1d-dep8-tep8 recipes (load-bearing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last run failed with "Available KV cache memory: -15.99 GiB" on every prefill rank — model weights + activations alone exceed the gpu-memory-utilization=0.8 budget by ~16 GB at DP=8 (full attention replicated per rank + 1/8 of FP4 experts). The upstream recipe ships with offload precisely to free that ~16 GB by spilling MoE expert weights to host DRAM. Restores the three offload knobs on prefill in both 1k/1k and 8k/1k: offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 numa-bind: true is still excluded — needs the configs/patches/vllm_numa_bind_hash_fix.py patch that our pinned NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship. Offload works without it (just slower host-side bandwidth). --- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 ++++++++ .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index c25de42a0..984c79526 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -103,6 +103,14 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + # CPU/DRAM expert offload — required for fit. Without these the prefill + # rank reports `Available KV cache memory: -16 GiB` and the engine + # refuses to start. Numa-bind from upstream is still off because our + # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the + # vllm_numa_bind_hash_fix.py patch. + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 1cf645e52..0c872e9c4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -11,10 +11,10 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # our launch script's SRT_SLURM_MODEL_PREFIX. -# * CPU/DRAM offload knobs (offload-group-size / -num-in-group / -# -prefetch-step / # offload-params) and numa-bind dropped — our -# clone is NVIDIA/srt-slurm@sa-submission-q2-2026 which doesn't ship -# the vllm_numa_bind_hash_fix.py patch. 
+# * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026 +# which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM +# expert offload (offload-group-size/-num-in-group/-prefetch-step) is +# KEPT — it's load-bearing here, see the comment in vllm_config.prefill. # * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode # dropped. Both require PR #68 sa-bench tokenizer support that our # pinned srtctl version doesn't have. The recipe-level @@ -116,6 +116,14 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + # CPU/DRAM expert offload — required for fit. Without these the prefill + # rank reports `Available KV cache memory: -16 GiB` and the engine + # refuses to start. Numa-bind from upstream is still off because our + # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the + # vllm_numa_bind_hash_fix.py patch. + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: From 47d3cdc6b52df99b9963ffefe1ac7ac11fae3b49 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:34:41 -0700 Subject: [PATCH 25/28] PR review fixes: harden cp -rT, refresh stale changelog description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * runners/launch_gb200-nv.sh: switch the recipe overlay step from `cp -r src dst` to `cp -rT src dst` (with explicit `mkdir -p dst` first). Addresses the bot review nit at line 144 — `cp -r src dst` works only because the upstream sa-submission-q2-2026 branch has no `recipes/vllm/deepseek-v4/` directory today; if upstream ever ships one, `cp -r` would nest as `recipes/vllm/deepseek-v4/deepseek-v4/...` and CONFIG_FILE in nvidia-master.yaml would silently resolve to the upstream stub. `-T` overlays unconditionally. * perf-changelog.yaml: refresh the dsv4-fp4-gb200-dynamo-vllm entry's description. The previous wording referenced "8k1k, 7p1d-dep8-dep16" and "Mirrors NVIDIA/srt-slurm PR #67" which is stale after the move to a 1k/1k sweep with TEP low-conc (mirrored from PR #71) plus two hand-rolled mid/high topologies. Also fixes the directory reference (recipes moved to benchmarks/multi_node/srt-slurm-recipes/ during the cleanup pass). 
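A minimal shell repro of the nesting pitfall (hypothetical scratch
paths; GNU coreutils cp, whose -T flag treats the destination as the
target directory itself rather than a directory to copy into):

    # Pretend upstream ships a stub recipes/vllm/deepseek-v4/.
    mkdir -p ours/deepseek-v4/1k1k upstream/recipes/vllm/deepseek-v4
    touch ours/deepseek-v4/1k1k/demo.yaml

    cp -r ours/deepseek-v4 upstream/recipes/vllm/deepseek-v4
    # nests: upstream/recipes/vllm/deepseek-v4/deepseek-v4/1k1k/demo.yaml

    rm -rf upstream/recipes/vllm/deepseek-v4/deepseek-v4  # reset the demo
    cp -rT ours/deepseek-v4 upstream/recipes/vllm/deepseek-v4
    # overlays: upstream/recipes/vllm/deepseek-v4/1k1k/demo.yaml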
--- perf-changelog.yaml | 7 ++++--- runners/launch_gb200-nv.sh | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d1f83e721..453488420 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1779,7 +1779,8 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: - - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" + - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" + - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 45d49c09b..224c3a928 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -144,7 +144,12 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 - cp -r "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + # Use `cp -rT` so if the upstream branch ever ships a stub + # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto + # it rather than nesting (`cp -r src dst` would create + # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). + mkdir -p recipes/vllm/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 9cd8f7070f52b8a3467fc5e8b63350767e1f7286 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:35:17 -0700 Subject: [PATCH 26/28] activate 8k1k --- .github/configs/nvidia-master.yaml | 102 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 91b771a67..33563fe25 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7577,57 +7577,51 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # --------------------------------------------------------------------- - # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the - # sweep-enabled gate while we collect 1k/1k data. Re-enable by - # uncommenting (remove the leading "# " on every line of the block - # below). The conc-lists already have 4/16/32 stripped — add them back - # together with the 1k/1k 1p4d block if you want the full sweep again. - # --------------------------------------------------------------------- - # - isl: 8192 - # osl: 1024 - # search-space: - # # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # # 4 nodes total. 
Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: false - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - # - conc-list: [2048, 4096] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true From 980b77749c31c33a751f75e80b7a85fd44907f4d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:39:26 -0700 Subject: [PATCH 27/28] Fix 8k/1k seq-len-config indent in nvidia-master.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the 8k/1k block was uncommented, every line landed two spaces too deep — the block became a child of the 1k/1k entry's search-space list instead of a sibling under seq-len-configs. process_changelog.py's pydantic check caught this: seq-len-configs.0.search-space.3.prefill: Field required seq-len-configs.0.search-space.3.isl: Extra inputs are not permitted (The validator was reading the 8k/1k entry as a 4th search-space item that lacked prefill/decode and had stray isl/osl fields.) Dedented the entire 8k/1k block by 2 spaces. Schema validates, matrix expansion produces 6 entries / 24 benchmark points across 1k/1k + 8k/1k. 
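Schematically (sibling entries and values elided; a sketch of the shape, not
the full schema), the nesting pydantic rejected vs. the intended one:

    # Before: two spaces too deep, so the 8k/1k entry parses as another
    # item of the 1k/1k entry's search-space list:
    seq-len-configs:
      - isl: 1024
        osl: 1024
        search-space:
          - conc-list: [...]
            prefill: {...}
            decode: {...}
          - isl: 8192          # <- the bogus search-space item: stray
            osl: 1024          #    isl/osl, no prefill/decode
            search-space: [...]

    # After: dedented two spaces, a proper sibling under seq-len-configs:
    seq-len-configs:
      - isl: 1024
        osl: 1024
        search-space: [...]
      - isl: 8192
        osl: 1024
        search-space: [...]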
--- .github/configs/nvidia-master.yaml | 94 +++++++++++++++--------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 33563fe25..3604e249e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7578,50 +7578,50 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true From d1349b2e0807087ecec39400b082fc6bb63f8e95 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:43:26 -0700 Subject: [PATCH 28/28] Align matrix conc-lists to recipe concurrencies (recipe is source of truth) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workflow only exports CONFIG_FILE to srtctl and doesn't rewrite the recipe's benchmark.concurrencies block — so what actually runs is determined by the recipe, while the matrix conc-list only drives job naming and result aggregation. When the two disagree the matrix labels end up wrong (some advertised concs never run; runs land under mismatched labels). 
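Schematically, the two declarations involved (the x-separated string shape
is inferred from the recipe values quoted below; treat this as a sketch,
not the exact recipe schema):

    # recipe YAML, read by srtctl -- this is what actually runs:
    benchmark:
      concurrencies: "128x256x1024x2048x4096"

    # matrix YAML (nvidia-master.yaml) -- job naming / aggregation only:
    - conc-list: [128, 256, 1024, 2048, 4096]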
Two mismatches caught by audit: 1k/1k 1p1d-dep8-dep16: matrix [256, 512, 1024, 2048, 3072, 4096] -> [128, 256, 1024, 2048, 4096] recipe stays 128x256x1024x2048x4096 8k/1k 7p1d-dep8-dep16: matrix [2048, 4096] -> [4096, 8192] recipe stays 4096x8192 Picked recipe-side as the source of truth so the recipes stay self-consistent; matrix labels now reflect what srtctl will actually run. --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3604e249e..eab500d25 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7548,7 +7548,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - - conc-list: [256, 512, 1024, 2048, 3072, 4096] + - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 tp: 8 @@ -7612,7 +7612,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: true # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] + - conc-list: [4096, 8192] prefill: num-worker: 7 tp: 8