Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7639,7 +7639,7 @@ dsv4-fp4-gb200-dynamo-vllm:
- isl: 8192
osl: 1024
search-space:
# Three validated 8k/1k points mirrored from NVIDIA/srt-slurm
# Validated 8k/1k points mirrored from NVIDIA/srt-slurm
# aflowers/vllm-gb200-v0.20.0 history. conc-list values match each
# recipe's benchmark.concurrencies.

Expand All @@ -7659,6 +7659,22 @@ dsv4-fp4-gb200-dynamo-vllm:
ep: 1
dp-attn: false

# Low-middle curve: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total
# with a dedicated NATS/etcd infra node.
- conc-list: [256, 512]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve.yaml"
decode:
num-worker: 4
tp: 8
ep: 1
dp-attn: false

# Mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with
# a dedicated NATS/etcd infra node.
- conc-list: [256]
Expand Down Expand Up @@ -7690,3 +7706,19 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 8
ep: 8
dp-attn: true

# MegaMOE max throughput: same 3 prefill (DEP=8 each) + 1 decode (DEP=8)
# shape, but uses deep_gemm_mega_moe on both workers and disables offload.
- conc-list: [4096]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-max-tpt-megamoe.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
name: "svf-vllm-disagg-gb200-low-middle-curve"

# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch:
# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve.yaml
#
# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a
# dedicated NATS/etcd infra node. Low-middle curve points at concurrencies
# 256 and 512.
#
# Local deltas vs upstream:
# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh.
# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to
# match nvidia-master.yaml image (which the launch script registers as
# the alias key in srtslurm.yaml). Upstream variants ship either the
# non-dynamo floating tag or a sha256 pin.
# * slurm.time_limit + health_check set to 8h / 1440 attempts to
# absorb cold-cache /mnt/numa1 model loads.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 8
prefill_workers: 1
decode_workers: 4
gpus_per_prefill: 8
gpus_per_decode: 8

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
# offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 8
pipeline-parallel-size: 1
# data-parallel-size: 8
# data-parallel-rpc-port: 13345
# enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 256
max-cudagraph-capture-size: 256
max-num-batched-tokens: 256
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "256x512"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
container:
image: "vllm/vllm-openai:v0.20.0-ubuntu2404"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
name: "svf-vllm-disagg-gb200-max-tpt-megamoe"

# Mirrored from NVIDIA/srt-slurm aflowers/vllm-gb200-v0.20.0 branch:
# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-max-tpt-megamoe.yaml
#
# Topology: 3 prefill (DEP=8 each) + 1 decode (DEP=8). 9 nodes total with a
# dedicated NATS/etcd infra node. MegaMOE max-throughput point at concurrency
# 4096 with no CPU/NVMe offload.
#
# Local deltas vs upstream:
# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match
# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh.
# * model.container set to vllm/vllm-openai:v0.20.0-ubuntu2404 to
# match nvidia-master.yaml image (which the launch script registers as
# the alias key in srtslurm.yaml). Upstream variants ship either the
# non-dynamo floating tag or a sha256 pin.
# * slurm.time_limit + health_check set to 8h / 1440 attempts to
# absorb cold-cache /mnt/numa1 model loads.
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 6
decode_nodes: 2
prefill_workers: 3
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
TORCH_SYMMMEM: "NVSHMEM"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
TORCH_SYMMMEM: "NVSHMEM"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
moe-backend: deep_gemm_mega_moe
enforce-eager: true
max-model-len: 9280
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.95
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
moe-backend: deep_gemm_mega_moe
max-model-len: 9280
max-num-seqs: 512
max-cudagraph-capture-size: 512
max-num-batched-tokens: 512
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "4096"
req_rate: "inf"
use_chat_template: true
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

identity:
model:
repo: "deepseek-ai/DeepSeek-V4-Pro"
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
container:
image: "vllm/vllm-openai:v0.20.0-ubuntu2404"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"
16 changes: 16 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1969,3 +1969,19 @@
- "Keeps the three validated 8k/1k points: low-latency 1P/1D TP8 conc=1, mid-curve 1P/1D DEP8 conc=256, and max-tpt 3P/1D DEP8 conc=4096"
- "All three recipes run NATS/etcd on a dedicated infra node and use compute-node local NVMe model weights via /mnt/numa1/models/deepseek-v4-pro/"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1163

- config-keys:
- dsv4-fp4-gb200-dynamo-vllm
description:
- "Add GB200 Dynamo vLLM MegaMOE max-throughput recipe at conc=4096"
- "Topology matches max-tpt: 3 prefill DEP8 workers and 1 decode DEP8 worker with dedicated NATS/etcd"
- "Uses deep_gemm_mega_moe on prefill/decode, TORCH_SYMMMEM=NVSHMEM, and no offload"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218

- config-keys:
- dsv4-fp4-gb200-dynamo-vllm
description:
- "Add GB200 Dynamo vLLM low-middle curve recipe at conc=256/512"
- "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
- "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218
Loading