diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 56521e394..8a80b3fb3 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7830,6 +7830,98 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
         ep: 8
         dp-attn: true
 
+dsv4-fp4-gb300-dynamo-vllm:
+  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb300-nv
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - conc-list: [192]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml"
+      decode:
+        num-worker: 6
+        tp: 4
+        ep: 1
+        dp-attn: false
+    - conc-list: [18]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml"
+      decode:
+        num-worker: 17
+        tp: 4
+        ep: 1
+        dp-attn: false
+    - conc-list: [4096]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    - conc-list: [4096]
+      prefill:
+        num-worker: 5
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    - conc-list: [4096]
+      prefill:
+        num-worker: 6
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+    - conc-list: [3072]
+      prefill:
+        num-worker: 7
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml"
+      decode:
+        num-worker: 2
+        tp: 16
+        ep: 16
+        dp-attn: true
+
 dsv4-fp4-gb300-dynamo-sglang:
   image: lmsysorg/sglang:deepseek-v4-grace-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml
new file mode 100644
index 000000000..a2c3ab80a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml
@@ -0,0 +1,106 @@
+name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4"
+
+# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72
+# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node.
+# Wide-decode point at concurrency 18 — each decode worker holds a
+# single replica.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 17
+  prefill_workers: 1
+  decode_workers: 17
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "18"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
new file mode 100644
index 000000000..c3b25acc1
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
@@ -0,0 +1,114 @@
+name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4"
+
+# Topology: 1 prefill (DEP=4) + 6 decode (TP=4). 7 GB300 nodes (1P + 6D = 28
+# GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. Low-mid curve
+# point at concurrency 192.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 6
+  prefill_workers: 1
+  decode_workers: 6
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "192"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
new file mode 100644
index 000000000..b97ef0d5a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
@@ -0,0 +1,122 @@
+name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24"
+
+# Topology: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 nodes (4P + 2D
+# = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
+# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
+# both workers.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  wheel: "1.2.0.dev20260426"
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 2
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      safetensors-load-strategy: "prefetch"
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      no-async-scheduling: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml
new file mode 100644
index 000000000..d83e6d771
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml
@@ -0,0 +1,122 @@
+name: "svf-vllm-disagg-gb300-5p1d-dep4-dep8-28"
+
+# Topology: 5 prefill (DEP=4 each) + 1 decode (DEP=8). 7 GB300 nodes (5P + 2D
+# = 28 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
+# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
+# both workers.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  wheel: "1.2.0.dev20260426"
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 5
+  decode_nodes: 2
+  prefill_workers: 5
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      safetensors-load-strategy: "prefetch"
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      no-async-scheduling: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml
new file mode 100644
index 000000000..4b54cc13e
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml
@@ -0,0 +1,122 @@
+name: "svf-vllm-disagg-gb300-6p1d-dep4-dep8-32"
+
+# Topology: 6 prefill (DEP=4 each) + 1 decode (DEP=8). 8 GB300 nodes (6P + 2D
+# = 32 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
+# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on
+# both workers.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  wheel: "1.2.0.dev20260426"
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 6
+  decode_nodes: 2
+  prefill_workers: 6
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_LOG_STATS_INTERVAL: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      safetensors-load-strategy: "prefetch"
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      no-disable-hybrid-kv-cache-manager: true
+      no-async-scheduling: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      tokenizer-mode: deepseek_v4
+      enable-ep-weight-filter: true
+      enable-sleep-mode: true
+      moe-backend: "deep_gemm_mega_moe"
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml
new file mode 100644
index 000000000..43c2031a8
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml
@@ -0,0 +1,119 @@
+name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16"
+
+# Topology: 7 prefill (DEP=4) + 2 decode (DEP=16). 15 GB300 nodes (7P + 8D
+# = 60 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node.
+# Wide-EP decode max-throughput point at concurrency 3072.
+model:
+  path: "deepseek-v4-pro"
+  container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+
+setup_script: vllm-container-deps.sh
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 1440
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 7
+  decode_nodes: 8
+  prefill_workers: 7
+  decode_workers: 2
+  gpus_per_prefill: 4
+  gpus_per_decode: 16
+
+infra:
+  etcd_nats_dedicated_node: true
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "3072"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2dfcda9fe..0403c2385 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2041,6 +2041,12 @@
     - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157
 
+- config-keys:
+    - dsv4-fp4-gb300-dynamo-vllm
+  description:
+    - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated Dynamo vLLM benchmarks at 8k/1k"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1238
+
 - config-keys:
     - qwen3.5-fp8-b200-sglang
   description:
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index eaac2a1e0..413df8d8e 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -7,12 +7,36 @@
 
 set -x
 
-if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
-    # Weights staged on compute-node-local NVMe at /scratch/models/dsv4/.
-    # The exact upstream recipes refer to this model as `dspro`.
+if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    # Weights staged on compute-node-local NVMe.
     export MODEL_PATH="/scratch/models/dsv4/"
+
+    if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
+        # Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers`
+        # (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt
+        # generation). The single-threaded random prompt generator in the
+        # upstream sa-bench dominates bench startup on the 7p1d/conc=8192
+        # sweep (~50 min for the main pass alone before the first HTTP
+        # request leaves the client). The fork bumps that to ~1 min via
+        # multiprocessing.Pool with `--random-num-workers 48`.
+        #
+        # TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR
+        # (https://github.com/NVIDIA/srt-slurm/pull/114) merges.
+        SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git"
+        SRT_SLURM_RECIPES_REF="4249d168208ff5ff1f30b3c1158d893cc0615bb5"
+        SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4"
+        SRT_RECIPE_DST="recipes/sglang/deepseek-v4"
+    elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
+        SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
+        SRT_SLURM_RECIPES_REF="aflowers/gb200-dsv4-recipes"
+        SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4"
+        SRT_RECIPE_DST="recipes/vllm/deepseek-v4"
+    else
+        echo "Unsupported framework on gb300-cw for dsv4/fp4: $FRAMEWORK. Currently supported: dynamo-sglang, dynamo-vllm"
+        exit 1
+    fi
 else
-    echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang"
+    echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4"
     exit 1
 fi
 
@@ -32,18 +56,6 @@ export NVIDIA_VISIBLE_DEVICES=all
 export NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
 NGINX_IMAGE="nginx:1.27.4"
-# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers`
-# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt
-# generation). The single-threaded random prompt generator in the
-# upstream sa-bench dominates bench startup on the 7p1d/conc=8192
-# sweep (~50 min for the main pass alone before the first HTTP
-# request leaves the client). The fork bumps that to ~1 min via
-# multiprocessing.Pool with `--random-num-workers 48`.
-#
-# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR
-# (https://github.com/NVIDIA/srt-slurm/pull/114) merges.
-SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git"
-SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5"
 
 # Squash files live alongside models on /mnt/vast (shared across nodes).
 # `squash_dupe` instead of `squash` to use '_'-separated names: srtctl /
@@ -51,33 +63,29 @@ SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5"
 # old /mnt/vast/squash dir contains '+'-separated files from prior runs.
 SQUASH_DIR="/mnt/vast/squash_dupe"
 mkdir -p "$SQUASH_DIR"
-# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the
-# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as
-# arm64-only. The CI runner pod is x86_64 and (a) cannot run
-# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs`
-# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"),
-# and (b) even with `--arch aarch64` the conversion still fails on x86.
-# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780`
-# the import has to be dispatched to an arm64 compute node via srun.
-# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh
-# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist
-# script when the docker tag is updated). Filename suffix `_arm64`
-# distinguishes the working arm64 sqsh from any stale amd64 shadow.
 SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh"
 NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh"
 
-if [[ ! -f "$SQUASH_FILE" ]]; then
-    echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2
-    echo "Refresh it on a GB300 compute node via the script in the gist:" >&2
-    echo "  https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2
-    exit 1
-fi
-if [[ ! -f "$NGINX_SQUASH_FILE" ]]; then
-    echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2
-    echo "Run on an aarch64 host:" >&2
-    echo "  enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2
-    exit 1
-fi
+# Run the import on a compute node via srun, not on the runner pod:
+# the runner pod is x86_64 while the compute nodes are aarch64, so the
+# arm64 squash file has to be built on a compute node.
+import_squash() {
+    local squash="$1" image="$2"
+    local lock="${squash}.lock"
+    srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --exclusive --time=180 bash -c "
+        exec 9>\"$lock\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $squash' >&2; exit 1; }
+        if unsquashfs -l \"$squash\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import: $squash'
+        else
+            rm -f \"$squash\"
+            enroot import -o \"$squash\" docker://$image
+        fi
+    "
+}
+
+import_squash "$SQUASH_FILE" "$IMAGE"
+import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
@@ -102,15 +110,11 @@ fi
 
 git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR"
 cd "$SRT_REPO_DIR"
-git checkout "$SRT_SLURM_RECIPES_COMMIT"
+git checkout "$SRT_SLURM_RECIPES_REF"
 
-# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm
-# checkout. Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch:
-# destination must be `recipes/sglang/deepseek-v4` because
-# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...`
-# in `.github/configs/nvidia-master.yaml` is what srtctl loads.
-mkdir -p recipes/sglang/deepseek-v4
-cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4
+# Overlay the hand-rolled DSV4 recipes onto the selected srt-slurm checkout.
+mkdir -p "$SRT_RECIPE_DST"
+cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST"
 
 echo "Installing srtctl..."
 # CRITICAL — uv install location.
@@ -166,7 +170,7 @@ mkdir -p configs/dynamo-wheels
 
 echo "Creating srtslurm.yaml configuration..."
 cat > srtslurm.yaml <<EOF
-# SRT SLURM Configuration for GB300-CW (SGLang)
+# SRT SLURM Configuration for GB300-CW
 
 default_account: "${SLURM_ACCOUNT}"
 default_partition: "${SLURM_PARTITION}"
@@ -192,6 +196,7 @@ model_paths:
 containers:
   dynamo-trtllm: ${SQUASH_FILE}
   dynamo-sglang: ${SQUASH_FILE}
+  dynamo-vllm: ${SQUASH_FILE}
   dspro-0426: ${SQUASH_FILE}
   dspro-0426-nixl: ${SQUASH_FILE}
   dsv4-grace-blackwell: ${SQUASH_FILE}
diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh
index 5e4a681f8..cbda51af8 100644
--- a/runners/launch_gb300-nv.sh
+++ b/runners/launch_gb300-nv.sh
@@ -2,7 +2,7 @@
 
 # This script sets up the environment and launches multi-node benchmarks
 
-set -x
+set -exo pipefail
 
 export SLURM_PARTITION="batch_1"
 export SLURM_ACCOUNT="benchmark"
@@ -18,8 +18,11 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then
     export SERVED_MODEL_NAME="deepseek-r1-fp8"
     export MODEL_PATH=/scratch/models/DeepSeek-R1-0528
     export SRT_SLURM_MODEL_PREFIX="dsr1-fp8"
+elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
+    export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro
+    export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro"
 else
-    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8"
+    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4"
     exit 1
 fi
 
@@ -55,23 +58,31 @@ export ISL="$ISL"
 export OSL="$OSL"
 
 echo "Cloning srt-slurm repository..."
-SRT_REPO_DIR="srt-slurm"
-if [ -d "$SRT_REPO_DIR" ]; then
-    echo "Removing existing $SRT_REPO_DIR..."
-    rm -rf "$SRT_REPO_DIR"
+RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum | cut -c1-12)
+SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}"
+rm -rf "$SRT_REPO_DIR"
+
+if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout aflowers/gb200-dsv4-recipes
+    mkdir -p recipes/vllm/deepseek-v4
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+else
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout sa-submission-q2-2026
 fi
 
-git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
-cd "$SRT_REPO_DIR"
-git checkout sa-submission-q2-2026
-
 echo "Installing srtctl..."
 export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin"
 curl -LsSf https://astral.sh/uv/install.sh | sh
 export PATH="$UV_INSTALL_DIR:$PATH"
 
-uv venv "$GITHUB_WORKSPACE/.venv"
-source "$GITHUB_WORKSPACE/.venv/bin/activate"
+VENV_DIR="${GITHUB_WORKSPACE}/.venv-srt-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}"
+rm -rf "$VENV_DIR"
+uv venv "$VENV_DIR"
+source "$VENV_DIR/bin/activate"
 uv pip install -e .
 
 if ! command -v srtctl &> /dev/null; then
@@ -82,7 +93,7 @@ fi
 echo "Configs available at: $SRT_REPO_DIR/"
 
 # Create srtslurm.yaml for srtctl (used by both frameworks)
-SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
+SRTCTL_ROOT="${SRT_REPO_DIR}"
 echo "Creating srtslurm.yaml configuration..."
 cat > srtslurm.yaml <<EOF
 # SRT SLURM Configuration for GB300
diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py
index e9a2195ed..aeebcfa1f 100644
--- a/utils/matrix_logic/generate_sweep_configs.py
+++ b/utils/matrix_logic/generate_sweep_configs.py
@@ -541,7 +541,24 @@ def get_lowest_conc(search_space_entry):
     return matrix_values
 
 
-def generate_test_config_sweep(args, all_config_data):
+def _runner_values_for_filter(runner: str, runner_data: dict, runner_node_filter: str | None) -> list[str]:
+    if not runner_node_filter:
+        return [runner]
+
+    candidates = runner_data.get(runner, [])
+    if runner_node_filter in runner:
+        candidates = [runner, *candidates]
+
+    matches = []
+    seen = set()
+    for node in candidates:
+        if runner_node_filter in node and node not in seen:
+            matches.append(node)
+            seen.add(node)
+    return matches
+
+
+def generate_test_config_sweep(args, all_config_data, runner_data=None):
     """Generate full sweep for specific config keys.
 
     Validates that all specified config keys exist before generating.
@@ -551,6 +568,8 @@ def generate_test_config_sweep(args, all_config_data):
 
     matrix_values = []
 
+    runner_data = runner_data or {}
+
     for key in resolved_keys:
         val = all_config_data[key]
         is_multinode = val.get(Fields.MULTINODE.value, False)
@@ -561,6 +580,10 @@ def generate_test_config_sweep(args, all_config_data):
         precision = val[Fields.PRECISION.value]
         framework = val[Fields.FRAMEWORK.value]
         runner = val[Fields.RUNNER.value]
+        runners_for_entry = _runner_values_for_filter(
+            runner, runner_data, getattr(args, 'runner_node_filter', None))
+        if not runners_for_entry:
+            continue
         disagg = val.get(Fields.DISAGG.value, False)
 
         # Build seq-len filter if --seq-lens was provided
@@ -607,25 +630,26 @@ def generate_test_config_sweep(args, all_config_data):
                             # No intersection with requested conc values; skip
                             continue
 
-                    entry = {
-                        Fields.IMAGE.value: image,
-                        Fields.MODEL.value: model,
-                        Fields.MODEL_PREFIX.value: model_code,
-                        Fields.PRECISION.value: precision,
-                        Fields.FRAMEWORK.value: framework,
-                        Fields.RUNNER.value: runner,
-                        Fields.ISL.value: isl,
-                        Fields.OSL.value: osl,
-                        Fields.SPEC_DECODING.value: spec_decoding,
-                        Fields.PREFILL.value: prefill,
-                        Fields.DECODE.value: decode,
-                        Fields.CONC.value: conc_values,
-                        Fields.MAX_MODEL_LEN.value: isl + osl + 256,
-                        Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
-                        Fields.DISAGG.value: disagg,
-                        Fields.RUN_EVAL.value: False,
-                    }
-                    matrix_values.append(validate_matrix_entry(entry, is_multinode=True))
+                    for runner_value in runners_for_entry:
+                        entry = {
+                            Fields.IMAGE.value: image,
+                            Fields.MODEL.value: model,
+                            Fields.MODEL_PREFIX.value: model_code,
+                            Fields.PRECISION.value: precision,
+                            Fields.FRAMEWORK.value: framework,
+                            Fields.RUNNER.value: runner_value,
+                            Fields.ISL.value: isl,
+                            Fields.OSL.value: osl,
+                            Fields.SPEC_DECODING.value: spec_decoding,
+                            Fields.PREFILL.value: prefill,
+                            Fields.DECODE.value: decode,
+                            Fields.CONC.value: conc_values,
+                            Fields.MAX_MODEL_LEN.value: isl + osl + 256,
+                            Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
+                            Fields.DISAGG.value: disagg,
+                            Fields.RUN_EVAL.value: False,
+                        }
+                        matrix_values.append(validate_matrix_entry(entry, is_multinode=True))
                 else:
                     # Single-node config
                     tp = bmk[Fields.TP.value]
@@ -657,26 +681,27 @@ def generate_test_config_sweep(args, all_config_data):
                             continue
 
                     for conc in conc_values:
-                        entry = {
-                            Fields.IMAGE.value: image,
-                            Fields.MODEL.value: model,
-                            Fields.MODEL_PREFIX.value: model_code,
-                            Fields.PRECISION.value: precision,
-                            Fields.FRAMEWORK.value: framework,
-                            Fields.RUNNER.value: runner,
-                            Fields.ISL.value: isl,
-                            Fields.OSL.value: osl,
-                            Fields.TP.value: tp,
-                            Fields.CONC.value: conc,
-                            Fields.MAX_MODEL_LEN.value: isl + osl + 256,
-                            Fields.EP.value: ep if ep is not None else 1,
-                            Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False,
-                            Fields.SPEC_DECODING.value: spec_decoding,
-                            Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
-                            Fields.DISAGG.value: disagg,
-                            Fields.RUN_EVAL.value: False,
-                        }
-                        matrix_values.append(validate_matrix_entry(entry, is_multinode=False))
+                        for runner_value in runners_for_entry:
+                            entry = {
+                                Fields.IMAGE.value: image,
+                                Fields.MODEL.value: model,
+                                Fields.MODEL_PREFIX.value: model_code,
+                                Fields.PRECISION.value: precision,
+                                Fields.FRAMEWORK.value: framework,
+                                Fields.RUNNER.value: runner_value,
+                                Fields.ISL.value: isl,
+                                Fields.OSL.value: osl,
+                                Fields.TP.value: tp,
+                                Fields.CONC.value: conc,
+                                Fields.MAX_MODEL_LEN.value: isl + osl + 256,
+                                Fields.EP.value: ep if ep is not None else 1,
+                                Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False,
+                                Fields.SPEC_DECODING.value: spec_decoding,
+                                Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}",
+                                Fields.DISAGG.value: disagg,
+                                Fields.RUN_EVAL.value: False,
+                            }
+                            matrix_values.append(validate_matrix_entry(entry, is_multinode=False))
 
     return matrix_values
 
@@ -947,7 +972,7 @@ def main():
         matrix_values = generate_runner_model_sweep_config(
             args, all_config_data, runner_data)
     elif args.command == 'test-config':
-        matrix_values = generate_test_config_sweep(args, all_config_data)
+        matrix_values = generate_test_config_sweep(args, all_config_data, runner_data)
     else:
         parser.error(f"Unknown command: {args.command}")
         
diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py
index a03ded47f..34bd4dc3d 100644
--- a/utils/matrix_logic/test_generate_sweep_configs.py
+++ b/utils/matrix_logic/test_generate_sweep_configs.py
@@ -8,6 +8,7 @@
     seq_len_to_str,
     generate_full_sweep,
     generate_runner_model_sweep_config,
+    generate_test_config_sweep,
     mark_eval_entries,
     apply_node_type_defaults,
     expand_config_keys,
@@ -1534,6 +1535,49 @@ def full_sweep_args_both():
     return args
 
 
+# =============================================================================
+# Test generate_test_config_sweep
+# =============================================================================
+
+class TestGenerateTestConfigSweep:
+    """Tests for exact config-key sweep generation."""
+
+    def test_runner_node_filter_expands_config_runner(self, sample_multinode_config, sample_runner_config):
+        """test-config should allow targeting one concrete runner node."""
+        args = argparse.Namespace(
+            config_keys=["dsr1-fp4-gb200-dynamo-trt"],
+            seq_lens=None,
+            conc=None,
+            runner_node_filter="gb200-nv_0",
+        )
+
+        result = generate_test_config_sweep(
+            args,
+            sample_multinode_config,
+            sample_runner_config,
+        )
+
+        assert len(result) == 1
+        assert result[0]["runner"] == "gb200-nv_0"
+
+    def test_runner_node_filter_no_match_skips_config(self, sample_multinode_config, sample_runner_config):
+        """Unmatched node filters should produce no entries."""
+        args = argparse.Namespace(
+            config_keys=["dsr1-fp4-gb200-dynamo-trt"],
+            seq_lens=None,
+            conc=None,
+            runner_node_filter="gb300-nv_0",
+        )
+
+        result = generate_test_config_sweep(
+            args,
+            sample_multinode_config,
+            sample_runner_config,
+        )
+
+        assert result == []
+
+
 # =============================================================================
 # Test apply_node_type_defaults
 # =============================================================================
@@ -1885,4 +1929,3 @@ def test_prefill_entries_never_in_single_or_evals(self, mixed_entries):
         assert all('prefill' not in x for x in single)
         assert all('prefill' not in x for x in evals)
 
-