Merged

33 commits
dbb2218
Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo
Oseltamivir Apr 24, 2026
1bb8494
flags
Oseltamivir Apr 24, 2026
41e71b8
import
Oseltamivir Apr 24, 2026
4854a7a
flags
Oseltamivir Apr 24, 2026
ac030e6
recipe change
Oseltamivir Apr 24, 2026
b592c60
prompt
Oseltamivir Apr 24, 2026
11a4c08
prompt
Oseltamivir Apr 24, 2026
9359fe8
prompt
Oseltamivir Apr 24, 2026
1d51ba1
weight loading
Oseltamivir Apr 24, 2026
4ce52cd
sweep
Oseltamivir Apr 24, 2026
071643b
Add 1k/1k DSV4-Pro recipes, comment out 8k/1k for now
Oseltamivir Apr 24, 2026
52b6a2e
Bump health-check and add slurm.time_limit to all DSV4 recipes
Oseltamivir Apr 24, 2026
8ed435e
Merge remote-tracking branch 'origin/main' into dsv4-fp4-gb200-dynamo…
Oseltamivir Apr 24, 2026
768cddc
Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1k DSV4
Oseltamivir Apr 24, 2026
af10ca0
path
Oseltamivir Apr 24, 2026
f524584
Revert "Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1…
Oseltamivir Apr 24, 2026
18100e5
Add 1k/1k 3p1d-dep8-dep16 recipe for high concurrency (4096, 8192)
Oseltamivir Apr 24, 2026
84be0b3
change concs
Oseltamivir Apr 24, 2026
8b1fbe2
Move srt-slurm-recipes/ under benchmarks/multi_node/
Oseltamivir Apr 24, 2026
e095e00
Add 1p4d-dep8-tep8 TEP recipes for low concurrency (1k/1k + 8k/1k)
Oseltamivir Apr 24, 2026
4666f60
conc changes
Oseltamivir Apr 24, 2026
e8922c5
Merge branch 'main' into dsv4-fp4-gb200-dynamo-vllm-disagg
Oseltamivir Apr 24, 2026
86bf700
Merge branch 'main' into dsv4-fp4-gb200-dynamo-vllm-disagg
Oseltamivir Apr 24, 2026
a51db71
perfchangelog
Oseltamivir Apr 24, 2026
c23c9fa
Undo 1p4d-dep8-tep8 TEP recipes
Oseltamivir Apr 25, 2026
7c8b859
Adopt NVIDIA aflowers/gb200-dsv4-recipes 1p1d-dep8-tep8 for low conc
Oseltamivir Apr 25, 2026
17b4a46
Merge branch 'main' into dsv4-fp4-gb200-dynamo-vllm-disagg
Oseltamivir Apr 25, 2026
42d9107
Re-add CPU/DRAM offload to 1p1d-dep8-tep8 recipes (load-bearing)
Oseltamivir Apr 25, 2026
47d3cdc
PR review fixes: harden cp -rT, refresh stale changelog description
Oseltamivir Apr 25, 2026
9cd8f70
activate 8k1k
Oseltamivir Apr 25, 2026
980b777
Fix 8k/1k seq-len-config indent in nvidia-master.yaml
Oseltamivir Apr 25, 2026
d1349b2
Align matrix conc-lists to recipe concurrencies (recipe is source of …
Oseltamivir Apr 25, 2026
6859910
Merge branch 'main' into dsv4-fp4-gb200-dynamo-vllm-disagg
Oseltamivir Apr 25, 2026
112 changes: 112 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -7541,3 +7541,115 @@ kimik2.5-fp4-gb200-dynamo-vllm:
          tp: 16
          ep: 16
          dp-attn: true

dsv4-fp4-gb200-dynamo-vllm:
  image: vllm/vllm-openai:deepseekv4-cu130
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb200
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  seq-len-configs:
    # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's
    # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg
    # at this seq-len yet (PR #67 only publishes 8k/1k).
    - isl: 1024
      osl: 1024
      search-space:
        # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
        # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch
        # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header).
        - conc-list: [1, 4, 8, 16, 32, 64]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
        # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16).
        # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096.
        - conc-list: [128, 256, 1024, 2048, 4096]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
        # Overlapping conc 4096 with the 1p1d block gives a measurable crossover
        # point; 8192 would saturate 1p1d's single prefill, so this topology
        # takes over there.
        - conc-list: [4096, 8192]
          prefill:
            num-worker: 3
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true

    - isl: 8192
      osl: 1024
      search-space:
        # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8).
        # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch.
        - conc-list: [1, 4, 8, 16, 32, 64]
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
        # Mid throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total.
        - conc-list: [512, 1024]
          prefill:
            num-worker: 3
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
        # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes
        # (full cluster). Mirrors NVIDIA/srt-slurm PR #67.
        - conc-list: [4096, 8192]
          prefill:
            num-worker: 7
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
          decode:
            num-worker: 1
            tp: 16
            ep: 16
            dp-attn: true
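
The node counts cited in the comments above (4, 6, 10, and 18 nodes) fall straight out of the worker shapes at 4 GPUs per GB200 node. A minimal sketch of the arithmetic, assuming each worker's GPU footprint equals its DP/TP width, as every shape in this matrix does:

```python
# Illustrative node-count check for the topologies above. Assumes 4 GPUs per
# GB200 node (resources.gpus_per_node in the recipes) and that every worker
# spans exactly its DP- or TP-width in GPUs.
GPUS_PER_NODE = 4

def nodes(prefills: int, gpus_per_prefill: int, decodes: int, gpus_per_decode: int) -> int:
    total_gpus = prefills * gpus_per_prefill + decodes * gpus_per_decode
    return total_gpus // GPUS_PER_NODE

assert nodes(1, 8, 1, 8) == 4    # 1p1d-dep8-tep8: low concurrency
assert nodes(1, 8, 1, 16) == 6   # 1p1d-dep8-dep16: 1k/1k mid throughput
assert nodes(3, 8, 1, 16) == 10  # 3p1d-dep8-dep16: high throughput
assert nodes(7, 8, 1, 16) == 18  # 7p1d-dep8-dep16: 8k/1k max, full cluster
```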
@@ -0,0 +1,125 @@
name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16"

# 1k/1k mid-to-high throughput topology. Extrapolated from
# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's
# DP>=8 minimum. A single prefill worker feeding a wide DP=16 decode handles
# conc 256-4096 cleanly for 1k prompts (per-rank prefill throughput is high
# enough at this prompt length; see the kimi precedent).
#
# Differences from our 8k1k 7p1d-dep8-dep16:
# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes
# * max-model-len: 3072 instead of auto
# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq)
# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism)
# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:deepseekv4-cu130"
  precision: "fp4"

dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  install: true

setup_script: vllm-container-deps.sh

# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
# a slow first-time Lustre load + cudagraph capture can't get cut off by the
# SLURM wall clock.
slurm:
  time_limit: "8:00:00"

# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
# Lustre when multiple workers contend for the same OSTs — a previous 1k/1k
# run hit the default 1800s limit. Keep this *very* generous: the cost of an
# over-long deadline is a job sitting idle, not wasted compute.
health_check:
  max_attempts: 1440
  interval_seconds: 10
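  # Deadline arithmetic: 1440 attempts x 10 s per attempt = 14400 s = 4 h.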

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 4
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 16

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

prefill_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"

decode_environment:
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"

vllm_config:
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    enforce-eager: true
    max-model-len: 3072
    max-num-seqs: 16
    max-num-batched-tokens: 16384
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-flashinfer-autotune: true
    block-size: 256
    gpu-memory-utilization: 0.88
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true

  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 16
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    max-model-len: 3072
    max-num-seqs: 512
    max-cudagraph-capture-size: 512
    max-num-batched-tokens: 512
    trust-remote-code: true
    no-enable-prefix-caching: true
    block-size: 256
    compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "128x256x1024x2048x4096"
  req_rate: "inf"
  use_chat_template: false
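
The batching knobs above encode the budget arithmetic from the recipe header. A quick sketch with the numbers copied straight from this config (purely illustrative; nothing here runs in the harness):

```python
# Sanity-check the prefill/decode batching relationship this recipe relies on.
ISL = 1024  # benchmark input sequence length

prefill = {"max-num-seqs": 16, "max-num-batched-tokens": 16384}
decode = {"max-num-seqs": 512, "max-num-batched-tokens": 512,
          "max-cudagraph-capture-size": 512}

# Prefill: 16 sequences x 1024 prompt tokens exactly fills the token budget.
assert prefill["max-num-seqs"] * ISL == prefill["max-num-batched-tokens"]

# Decode emits one token per running sequence per step, so the token budget
# and the cudagraph capture size only need to cover max-num-seqs.
assert decode["max-num-batched-tokens"] == decode["max-num-seqs"]
assert decode["max-cudagraph-capture-size"] >= decode["max-num-seqs"]
```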
@@ -0,0 +1,144 @@
name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8"

# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from
# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only
# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). No
# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet.
#
# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets
# very low concurrency (1-64).
#
# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see
# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list.

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:deepseekv4-cu130"
  precision: "fp4"

dynamo:
  hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b
  install: true

setup_script: vllm-container-deps.sh

slurm:
  time_limit: "8:00:00"

health_check:
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null

prefill_environment:
  VLLM_ENGINE_READY_TIMEOUT_S: "3600"
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
  VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: "NVL"

decode_environment:
  VLLM_ENGINE_READY_TIMEOUT_S: "3600"
  TILELANG_CLEANUP_TEMP_FILES: "1"
  VLLM_USE_NCCL_SYMM_MEM: "1"
  NCCL_CUMEM_ENABLE: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_NVLS_ENABLE: "1"
  VLLM_SERVER_DEV_MODE: "1"
  VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
  VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
  UCX_MEMTYPE_CACHE: "n"
  UCX_MEMTYPE_REG_WHOLE: "n"
  UCX_TLS: "cuda_copy,cuda_ipc,tcp"
  UCX_CUDA_IPC_ENABLE_MNNVL: "y"
  NCCL_P2P_LEVEL: "NVL"

vllm_config:
  prefill:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 1
    pipeline-parallel-size: 1
    data-parallel-size: 8
    data-parallel-rpc-port: 13345
    enable-expert-parallel: true
    enforce-eager: true
    max-model-len: 3072
    max-num-seqs: 16
    max-num-batched-tokens: 32768
    trust-remote-code: true
    no-enable-prefix-caching: true
    no-enable-flashinfer-autotune: true
    no-async-scheduling: true
    block-size: 256
    gpu-memory-utilization: 0.8
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    # CPU/DRAM expert offload — required for fit. Without these the prefill
    # rank reports `Available KV cache memory: -16 GiB` and the engine
    # refuses to start. Numa-bind from upstream is still off because our
    # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the
    # vllm_numa_bind_hash_fix.py patch.
    offload-group-size: 3
    offload-num-in-group: 1
    offload-prefetch-step: 2
    tokenizer-mode: deepseek_v4

  decode:
    kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    kv-cache-dtype: "fp8"
    tensor-parallel-size: 8
    pipeline-parallel-size: 1
    enable-expert-parallel: true
    max-model-len: 3072
    max-num-seqs: 64
    max-cudagraph-capture-size: 64
    max-num-batched-tokens: 64
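    # Decode emits one token per running sequence per step, so a 64-token
    # batch budget covers max-num-seqs=64 and the cudagraph capture size matches.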
    trust-remote-code: true
    no-enable-prefix-caching: true
    block-size: 256
    attention-config: '{"use_fp4_indexer_cache":true}'
    compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}'
    gpu-memory-utilization: 0.9
    stream-interval: 50
    no-disable-hybrid-kv-cache-manager: true
    enable-sleep-mode: true
    tokenizer-mode: deepseek_v4

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x4x8x16x32x64"
  req_rate: "inf"
  use_chat_template: false
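
Commit d1349b2 makes each recipe the source of truth for its concurrency sweep, with the matrix conc-lists in nvidia-master.yaml aligned to it. A hedged sketch of what that consistency check could look like; the 'x'-separated spec format matches the strings in these recipes, while the function and variable names are hypothetical, not part of the benchmark harness:

```python
# Hypothetical drift check between a recipe's sa-bench concurrency string and
# the corresponding conc-list in nvidia-master.yaml (per commit d1349b2,
# "recipe is source of truth").
def parse_concurrencies(spec: str) -> list[int]:
    """Parse sa-bench's 'x'-separated concurrency spec, e.g. '1x4x8x16x32x64'."""
    return [int(tok) for tok in spec.split("x")]

recipe_concs = parse_concurrencies("1x4x8x16x32x64")  # benchmark.concurrencies above
matrix_concs = [1, 4, 8, 16, 32, 64]                  # conc-list in the master matrix

assert recipe_concs == matrix_concs, "matrix conc-list drifted from recipe"
```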