diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index cccde0bcc..1d467308f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7541,3 +7541,115 @@ kimik2.5-fp4-gb200-dynamo-vllm:
             tp: 16
             ep: 16
             dp-attn: true
+
+dsv4-fp4-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:deepseekv4-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  seq-len-configs:
+    # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's
+    # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg
+    # at this seq-len yet (PR #67 only publishes 8k/1k).
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
+        # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch
+        # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header).
+        - conc-list: [1, 4, 8, 16, 32, 64]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+          decode:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+        # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16).
+        # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096.
+        - conc-list: [128, 256, 1024, 2048, 4096]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
+          decode:
+            num-worker: 1
+            tp: 16
+            ep: 16
+            dp-attn: true
+        # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
+        # The 4096 overlap with the 1p1d block gives a crossover point. 8192
+        # would saturate 1p1d's prefill, so this topology takes over there.
+        - conc-list: [4096, 8192]
+          prefill:
+            num-worker: 3
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+          decode:
+            num-worker: 1
+            tp: 16
+            ep: 16
+            dp-attn: true
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
+        # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch.
+        - conc-list: [1, 4, 8, 16, 32, 64]
+          prefill:
+            num-worker: 1
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
+          decode:
+            num-worker: 1
+            tp: 8
+            ep: 1
+            dp-attn: false
+        # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total.
+        - conc-list: [512, 1024]
+          prefill:
+            num-worker: 3
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
+          decode:
+            num-worker: 1
+            tp: 16
+            ep: 16
+            dp-attn: true
+        # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes
+        # (full cluster). Mirrors NVIDIA/srt-slurm PR #67.
+        - conc-list: [4096, 8192]
+          prefill:
+            num-worker: 7
+            tp: 8
+            ep: 8
+            dp-attn: true
+          additional-settings:
+            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
+          decode:
+            num-worker: 1
+            tp: 16
+            ep: 16
+            dp-attn: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..bf5b441b9
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -0,0 +1,125 @@
+name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16"
+
+# 1k/1k mid-to-high throughput topology. Extrapolated from
+# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's
+# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles
+# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high
+# enough at this prompt length; see kimi precedent).
+#
+# Differences from our 8k1k 7p1d-dep8-dep16:
+#   * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes
+#   * max-model-len: 3072 instead of auto
+#   * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq)
+#   * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism)
+#   * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
+# a slow first-time Lustre load + cudagraph capture can't get cut off by the
+# SLURM wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
+# Lustre with multiple workers contending for the same OSTs — previous 1k/1k
+# run hit the default 1800s. Make this *very* generous since the cost of an
+# over-long deadline is just sitting idle, not wasted compute.
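+# Consistency check (our own arithmetic, not an upstream figure): the
+# effective deadline below is max_attempts x interval_seconds =
+# 1440 x 10 s = 14400 s = 4 h, matching the "4 hours" above.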
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 3072
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      gpu-memory-utilization: 0.88
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x1024x2048x4096"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
new file mode 100644
index 000000000..984c79526
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -0,0 +1,144 @@
+name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8"
+
+# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from
+# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only
+# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). No
+# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet.
+#
+# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets
+# very low concurrency (1-64).
+#
+# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see
+# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list.
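+#
+# Worked check on the max-model-len values (our arithmetic, not stated
+# upstream): the 8k/1k sibling budgets 8192 ISL + 1024 OSL + 64 pad = 9280,
+# while here 1024 + 1024 = 2048 is rounded up to 3072, leaving a more
+# generous 1024-token margin at the shorter sequence length.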
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 3072
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      # CPU/DRAM expert offload — required for fit. Without these the prefill
+      # rank reports `Available KV cache memory: -16 GiB` and the engine
+      # refuses to start. Numa-bind from upstream is still off because our
+      # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the
+      # vllm_numa_bind_hash_fix.py patch.
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 64
+      max-cudagraph-capture-size: 64
+      max-num-batched-tokens: 64
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..63e9e280c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -0,0 +1,117 @@
+name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16"
+
+# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single
+# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those
+# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling)
+# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s
+# exceeds what one DP=8 worker can sustain.
+#
+# Decode capacity:
+#   max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which
+#   leaves headroom over the conc=8192 working set (per-rank avg 512).
+#   max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is
+#   ~512 so cudagraphs still apply at steady state.
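+#
+# Worked capacity math behind the figures above (our arithmetic from the
+# values below, not an upstream reference):
+#   total decode slots = data-parallel-size x max-num-seqs = 16 x 1024 = 16384
+#   per-rank batch at conc 8192 = 8192 / 16 ranks = 512 <= capture size 512
+# so even the top of the sweep stays on captured cudagraphs.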
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 3072
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      gpu-memory-utilization: 0.88
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 3072
+      max-num-seqs: 1024
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 1024
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
new file mode 100644
index 000000000..0c872e9c4
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
@@ -0,0 +1,157 @@
+name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8"
+
+# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch:
+#   recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml
+#
+# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets
+# very low concurrency (1-64) where TEP-style decode (TP-sharded
+# attention + EP'd experts within one worker) gives the best per-user
+# latency.
+#
+# Local deltas vs upstream:
+#   * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
+#     our launch script's SRT_SLURM_MODEL_PREFIX.
+#   * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026
+#     which doesn't ship the vllm_numa_bind_hash_fix.py patch.
+#   * CPU/DRAM expert offload (offload-group-size/-num-in-group/-prefetch-step)
+#     is KEPT — it's load-bearing here, see the comment in vllm_config.prefill.
+#   * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode
+#     dropped. Both require PR #68 sa-bench tokenizer support that our
+#     pinned srtctl version doesn't have. The recipe-level
+#     `tokenizer-mode: deepseek_v4` for workers stays.
+#   * Container kept on the floating tag (`:deepseekv4-cu130`) instead of
+#     the upstream sha256 pin.
+#   * health_check / slurm.time_limit added — we observed cold-cache
+#     Lustre loads exceeding the default 1800s deadline.
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_P2P_LEVEL: NVL
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9280
+      max-num-seqs: 16
+      max-num-batched-tokens: 32768
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.8
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      # CPU/DRAM expert offload — required for fit. Without these the prefill
+      # rank reports `Available KV cache memory: -16 GiB` and the engine
+      # refuses to start. Numa-bind from upstream is still off because our
+      # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the
+      # vllm_numa_bind_hash_fix.py patch.
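+      # Rough sketch of why offload flips the sign (our reading of vLLM's
+      # startup memory profiling, not an upstream statement): the KV pool
+      # is sized roughly as
+      #   kv_bytes ~= gpu-memory-utilization * HBM - weights - activation peak
+      # so a negative result aborts startup, and moving a slice of experts
+      # to CPU/DRAM shrinks the weights term enough to make it positive.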
+      offload-group-size: 3
+      offload-num-in-group: 1
+      offload-prefetch-step: 2
+      tokenizer-mode: deepseek_v4
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9280
+      max-num-seqs: 64
+      max-cudagraph-capture-size: 64
+      max-num-batched-tokens: 64
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      attention-config: '{"use_fp4_indexer_cache":true}'
+      compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..d6b750bf2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml
@@ -0,0 +1,112 @@
+name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16"
+
+# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single
+# wide decode (DP=16). Targets conc 512-1024 where a single big decode
+# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d
+# reference (PR #67); only resources, prefill_workers count, and
+# benchmark concurrencies differ. Decode capacity matches 7p1d
+# (max-num-seqs=256) since the decode topology itself is identical.
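+#
+# Node accounting (derived from the resources block below): each DP=8
+# prefill worker spans 8 GPUs / 4 GPUs-per-node = 2 nodes, so 3 workers
+# take 6 prefill nodes; the DP=16 decode takes 16 / 4 = 4; 10 nodes total.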
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: auto
+      max-num-seqs: 2
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      gpu-memory-utilization: 0.88
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: auto
+      max-num-seqs: 256
+      max-cudagraph-capture-size: 256
+      max-num-batched-tokens: 256
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
new file mode 100644
index 000000000..6213373b3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
@@ -0,0 +1,122 @@
+name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16"
+
+# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra
+# benchmark flag: use_chat_template=false. The HF tokenizer for
+# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's
+# --use-chat-template path calls tokenizer.apply_chat_template() and raises
+# ValueError. Throughput benchmarking uses /v1/completions with random tokens
+# anyway — no chat template needed.
+#
+# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a
+# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/
+# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and
+# uses this native formatter — no custom Jinja template required.
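+#
+# Illustrative repro of the sa-bench failure mode (hypothetical snippet,
+# not sa-bench code; API names from HF transformers):
+#   tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V4-Pro")
+#   tok.apply_chat_template([{"role": "user", "content": "hi"}])
+#   # -> ValueError: chat_template is not set (message paraphrased)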
+
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:deepseekv4-cu130"
+  precision: "fp4"
+
+dynamo:
+  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads
+# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor
+# shards with 14 prefill nodes contending for the same OSTs. The first
+# bump to 7200s was still insufficient in one case, so pad generously to
+# 14400s (4h). An over-long deadline only costs idle time, not compute.
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 14
+  decode_nodes: 4
+  prefill_workers: 7
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    VLLM_SERVER_DEV_MODE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: auto
+      max-num-seqs: 2
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      block-size: 256
+      gpu-memory-utilization: 0.88
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: auto
+      max-num-seqs: 256
+      max-cudagraph-capture-size: 256
+      max-num-batched-tokens: 256
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096x8192"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 54c960524..a6c811748 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1803,3 +1803,12 @@
     - "300 minutes matches the GH Actions outer timeout-minutes cap in benchmark-tmpl.yml"
     - "Retriggering dsv4-fp8-mi355x-sglang"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1148
+
+- config-keys:
+    - dsv4-fp4-gb200-dynamo-vllm
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k and 8k/1k sweeps)"
+    - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)"
+    - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
+    - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index b746e4a24..224c3a928 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -42,8 +42,14 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     if [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4"
         export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4"
+    elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+        # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre
+        # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the
+        # model.path alias in our DSV4 recipes.
+        export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/"
+        export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
     else
-        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4"
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4"
         exit 1
     fi
 else
@@ -134,7 +140,17 @@ if [ -d "$SRT_REPO_DIR" ]; then
     rm -rf "$SRT_REPO_DIR"
 fi
 
-if [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout sa-submission-q2-2026
+    # Use `cp -rT` so if the upstream branch ever ships a stub
+    # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto
+    # it rather than nesting (`cp -r src dst` would create
+    # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case).
+    mkdir -p recipes/vllm/deepseek-v4
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q2-2026