From 9e93341dddce20df6f098ddd3a41aef1b42fba67 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 14:50:36 -0700 Subject: [PATCH 1/3] Add GB200 DSV4 Dynamo vLLM MTP2 recipes --- .github/configs/nvidia-master.yaml | 83 +++++++++ .../disagg-gb200-high-tpt-megamoe-mtp2.yaml | 157 +++++++++++++++++ .../8k1k/disagg-gb200-low-latency-mtp2.yaml | 145 ++++++++++++++++ .../disagg-gb200-low-middle-curve-mtp2.yaml | 154 +++++++++++++++++ .../disagg-gb200-mid-curve-megamoe-mtp2.yaml | 159 ++++++++++++++++++ perf-changelog.yaml | 7 + 6 files changed, 705 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 15df0cc5f..7fe815eaf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7749,6 +7749,89 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true +# MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM nightly image +# and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm. +dsv4-fp4-gb200-dynamo-vllm-mtp2: + image: vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # Low latency: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total + # with a dedicated NATS/etcd infra node. 
+ - conc-list: [1, 2, 4, 8, 16] + spec-decoding: mtp + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + + # Low-middle transition: 1 prefill (DEP=8) + 4 decode (TP=8). + # 11 nodes total with a dedicated NATS/etcd infra node. + - conc-list: [16] + spec-decoding: mtp + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + + # MegaMOE mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8). + # 5 nodes total with a dedicated NATS/etcd infra node. + - conc-list: [128] + spec-decoding: mtp + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # MegaMOE high throughput: 2 prefill (DEP=8 each) + 1 decode (DEP=8). + # 7 nodes total with a dedicated NATS/etcd infra node. 
+ - conc-list: [1024] + spec-decoding: mtp + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml new file mode 100644 index 000000000..3ae5f52be --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml @@ -0,0 +1,157 @@ +name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml +# +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 7 nodes total with a +# dedicated NATS/etcd infra node. MegaMOE MTP2 high-throughput point at +# concurrency 1024 with no CPU/NVMe offload. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + speculative-config: 
'{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.94 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml new file mode 100644 index 000000000..fda9903d4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml @@ -0,0 +1,145 @@ +name: "svf-vllm-disagg-gb200-low-latency-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency-mtp2.yaml +# +# Topology: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total with a +# dedicated NATS/etcd infra node. MTP2 low-latency points at concurrencies +# 1/2/4/8/16. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-prefill-2p1d-mtp2"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enforce-eager: true + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 32768 + trust-remote-code: true + 
no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-decode-2p1d-mtp2"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml new file mode 100644 index 000000000..0fc0f5f4b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml @@ -0,0 +1,154 @@ +name: 
"svf-vllm-disagg-gb200-low-middle-curve-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a +# dedicated NATS/etcd infra node. MTP2 low-middle transition point at +# concurrency 16. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + 
TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + 
+benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml new file mode 100644 index 000000000..41cb7e1ef --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml @@ -0,0 +1,159 @@ +name: "svf-vllm-disagg-gb200-mid-curve-megamoe-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml +# +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with a +# dedicated NATS/etcd infra node. MegaMOE MTP2 mid-curve point at +# concurrency 128 with no CPU/NVMe offload. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + speculative-config: 
'{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0e13cb570..1753cc1d5 100644 --- a/perf-changelog.yaml 
+++ b/perf-changelog.yaml @@ -2056,3 +2056,10 @@ - "Add --gpu-memory-utilization 0.9 to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126 +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm-mtp2 + description: + - "Add DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" + - "Recipes cover 8k/1k low-latency 2P/1D TP8 conc=1/2/4/8/16, low-middle 1P/4D DEP8/TP8 conc=16, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" + - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242 From cd137a1573dbe68d09d90a67ba0c09c887e227d2 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 18:44:57 -0700 Subject: [PATCH 2/3] Update GB200 DSV4 MTP2 Pareto recipes --- .github/configs/nvidia-master.yaml | 18 +- .../8k1k/agg-gb200-low-latency-mtp2.yaml | 86 ++++++++++ .../disagg-gb200-high-tpt-megamoe-mtp2.yaml | 20 +-- .../8k1k/disagg-gb200-low-latency-mtp2.yaml | 112 ++++++------- .../disagg-gb200-low-middle-curve-mtp2.yaml | 154 ------------------ .../disagg-gb200-mid-curve-megamoe-mtp2.yaml | 14 -- 6 files changed, 147 insertions(+), 257 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7fe815eaf..56521e394 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7764,26 +7764,24 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: - isl: 8192 osl: 1024 search-space: - # Low latency: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total - # with a dedicated NATS/etcd infra node. - - conc-list: [1, 2, 4, 8, 16] + # Aggregate low latency: TP=8, max-num-seqs=4. 
+ - conc-list: [1] spec-decoding: mtp prefill: - num-worker: 2 + num-worker: 1 tp: 8 ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml" decode: - num-worker: 1 + num-worker: 0 tp: 8 ep: 1 dp-attn: false - # Low-middle transition: 1 prefill (DEP=8) + 4 decode (TP=8). - # 11 nodes total with a dedicated NATS/etcd infra node. - - conc-list: [16] + # Low-latency bridge: 1 prefill (DEP=8) + 4 decode (TP=8), no offload. + - conc-list: [16, 32, 64] spec-decoding: mtp prefill: num-worker: 1 @@ -7791,7 +7789,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" decode: num-worker: 4 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml new file mode 100644 index 000000000..a5c3877d1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml @@ -0,0 +1,86 @@ +name: "svf-vllm-agg-gb200-low-latency-mtp2" + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" 
+ VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + attention-config: '{"use_fp4_indexer_cache":true}' + tokenizer-mode: deepseek_v4 + max-model-len: 9280 + max-num-seqs: 4 + max-num-batched-tokens: 8192 + max-cudagraph-capture-size: 4 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.9 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml index 3ae5f52be..1464135e5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml @@ -1,19 +1,5 @@ name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2" -# Mirrored from 
NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml -# -# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 7 nodes total with a -# dedicated NATS/etcd infra node. MegaMOE MTP2 high-throughput point at -# concurrency 1024 with no CPU/NVMe offload. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" @@ -48,7 +34,6 @@ infra: frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null @@ -63,6 +48,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -77,6 +64,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -137,7 +126,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml index fda9903d4..016f03755 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml @@ -1,19 +1,5 @@ -name: "svf-vllm-disagg-gb200-low-latency-mtp2" +name: svf-vllm-disagg-gb200-low-latency-mtp2 -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency-mtp2.yaml -# -# Topology: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total with a -# dedicated NATS/etcd infra node. MTP2 low-latency points at concurrencies -# 1/2/4/8/16. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" @@ -33,66 +19,67 @@ health_check: interval_seconds: 10 resources: - gpu_type: "gb200" + gpu_type: gb200 gpus_per_node: 4 - prefill_nodes: 4 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 gpus_per_prefill: 8 gpus_per_decode: 8 - infra: etcd_nats_dedicated_node: true - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
VLLM_ENGINE_READY_TIMEOUT_S: '3600' + TILELANG_CLEANUP_TEMP_FILES: '1' + VLLM_USE_NCCL_SYMM_MEM: '1' + NCCL_CUMEM_ENABLE: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_NVLS_ENABLE: '1' + VLLM_SERVER_DEV_MODE: '1' + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: '1024' + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: '2048' + UCX_MEMTYPE_CACHE: 'n' + UCX_MEMTYPE_REG_WHOLE: 'n' + UCX_TLS: cuda_copy,cuda_ipc,tcp + UCX_CUDA_IPC_ENABLE_MNNVL: 'y' NCCL_P2P_LEVEL: NVL decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" + VLLM_ENGINE_READY_TIMEOUT_S: '3600' + TILELANG_CLEANUP_TEMP_FILES: '1' + VLLM_USE_NCCL_SYMM_MEM: '1' + NCCL_CUMEM_ENABLE: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_NVLS_ENABLE: '1' + VLLM_SERVER_DEV_MODE: '1' + UCX_MEMTYPE_CACHE: 'n' + UCX_MEMTYPE_REG_WHOLE: 'n' + UCX_TLS: cuda_copy,cuda_ipc,tcp + UCX_CUDA_IPC_ENABLE_MNNVL: 'y' NCCL_P2P_LEVEL: NVL vllm_config: prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-prefill-2p1d-mtp2"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: deepseek-ai/DeepSeek-V4-Pro + kv-cache-dtype: fp8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true enforce-eager: true speculative-config: '{"method":"mtp","num_speculative_tokens":2}' attention-config: '{"use_fp4_indexer_cache":true}' max-model-len: 9280 max-num-seqs: 8 - max-num-batched-tokens: 32768 + max-num-batched-tokens: 16384 trust-remote-code: true no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true @@ -104,17 +91,17 @@ backend: numa-bind: true tokenizer-mode: deepseek_v4 decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-decode-2p1d-mtp2"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: deepseek-ai/DeepSeek-V4-Pro + kv-cache-dtype: fp8 tensor-parallel-size: 8 pipeline-parallel-size: 1 speculative-config: '{"method":"mtp","num_speculative_tokens":2}' attention-config: '{"use_fp4_indexer_cache":true}' max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -125,15 +112,14 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "1x2x4x8x16" - req_rate: "inf" + concurrencies: 16x32x64 + req_rate: inf use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + custom_tokenizer: sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml deleted file mode 100644 index 0fc0f5f4b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: "svf-vllm-disagg-gb200-low-middle-curve-mtp2" - -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# 
recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml -# -# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a -# dedicated NATS/etcd infra node. MTP2 low-middle transition point at -# concurrency 16. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - 
UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-hybrid-lb: true - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - speculative-config: '{"method":"mtp","num_speculative_tokens":2}' - attention-config: '{"use_fp4_indexer_cache":true}' - max-model-len: 9280 - max-num-seqs: 8 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - speculative-config: '{"method":"mtp","num_speculative_tokens":2}' - attention-config: '{"use_fp4_indexer_cache":true}' - max-model-len: 9280 - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16" - req_rate: "inf" - use_chat_template: true - 
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" - frameworks: - dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml index 41cb7e1ef..001101525 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml @@ -1,19 +1,5 @@ name: "svf-vllm-disagg-gb200-mid-curve-megamoe-mtp2" -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml -# -# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with a -# dedicated NATS/etcd infra node. MegaMOE MTP2 mid-curve point at -# concurrency 128 with no CPU/NVMe offload. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. 
model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" From dc7bdf4a1f4a2c82bbfdade7d3b62d9f3f09eebf Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 18:47:53 -0700 Subject: [PATCH 3/3] Update GB200 MTP2 changelog description --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1753cc1d5..2dfcda9fe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2059,7 +2059,7 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm-mtp2 description: - - "Add DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" - - "Recipes cover 8k/1k low-latency 2P/1D TP8 conc=1/2/4/8/16, low-middle 1P/4D DEP8/TP8 conc=16, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" + - "Add final DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" + - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242