From 9e93341dddce20df6f098ddd3a41aef1b42fba67 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 14:50:36 -0700 Subject: [PATCH 1/3] Add GB200 DSV4 Dynamo vLLM MTP2 recipes --- .github/configs/nvidia-master.yaml | 83 +++++++++ .../disagg-gb200-high-tpt-megamoe-mtp2.yaml | 157 +++++++++++++++++ .../8k1k/disagg-gb200-low-latency-mtp2.yaml | 145 ++++++++++++++++ .../disagg-gb200-low-middle-curve-mtp2.yaml | 154 +++++++++++++++++ .../disagg-gb200-mid-curve-megamoe-mtp2.yaml | 159 ++++++++++++++++++ perf-changelog.yaml | 7 + 6 files changed, 705 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 15df0cc5f..7fe815eaf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7749,6 +7749,89 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true +# MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM nightly image +# and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm. +dsv4-fp4-gb200-dynamo-vllm-mtp2: + image: vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # Low latency: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total + # with a dedicated NATS/etcd infra node. 
+ - conc-list: [1, 2, 4, 8, 16] + spec-decoding: mtp + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + + # Low-middle transition: 1 prefill (DEP=8) + 4 decode (TP=8). + # 11 nodes total with a dedicated NATS/etcd infra node. + - conc-list: [16] + spec-decoding: mtp + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false + + # MegaMOE mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8). + # 5 nodes total with a dedicated NATS/etcd infra node. + - conc-list: [128] + spec-decoding: mtp + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + + # MegaMOE high throughput: 2 prefill (DEP=8 each) + 1 decode (DEP=8). + # 7 nodes total with a dedicated NATS/etcd infra node. 
+ - conc-list: [1024] + spec-decoding: mtp + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml new file mode 100644 index 000000000..3ae5f52be --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml @@ -0,0 +1,157 @@ +name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml +# +# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 7 nodes total with a +# dedicated NATS/etcd infra node. MegaMOE MTP2 high-throughput point at +# concurrency 1024 with no CPU/NVMe offload. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + speculative-config: 
'{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.94 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml new file mode 100644 index 000000000..fda9903d4 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml @@ -0,0 +1,145 @@ +name: "svf-vllm-disagg-gb200-low-latency-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency-mtp2.yaml +# +# Topology: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total with a +# dedicated NATS/etcd infra node. MTP2 low-latency points at concurrencies +# 1/2/4/8/16. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-prefill-2p1d-mtp2"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enforce-eager: true + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 32768 + trust-remote-code: true + 
no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-decode-2p1d-mtp2"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml new file mode 100644 index 000000000..0fc0f5f4b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml @@ -0,0 +1,154 @@ +name: 
"svf-vllm-disagg-gb200-low-middle-curve-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml +# +# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a +# dedicated NATS/etcd infra node. MTP2 low-middle transition point at +# concurrency 16. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + 
TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + 
+benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml new file mode 100644 index 000000000..41cb7e1ef --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml @@ -0,0 +1,159 @@ +name: "svf-vllm-disagg-gb200-mid-curve-megamoe-mtp2" + +# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: +# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml +# +# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with a +# dedicated NATS/etcd infra node. MegaMOE MTP2 mid-curve point at +# concurrency 128 with no CPU/NVMe offload. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. +# * model.container set to the vLLM nightly image to match +# nvidia-master.yaml image. +# * slurm.time_limit + health_check set to 8h / 1440 attempts to +# absorb cold-cache /mnt/numa1 model loads. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + enforce-eager: true + speculative-config: 
'{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 8 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enable-ep-weight-filter: true + moe-backend: deep_gemm_mega_moe + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + attention-config: '{"use_fp4_indexer_cache":true}' + max-model-len: 9280 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0e13cb570..1753cc1d5 100644 --- a/perf-changelog.yaml 
+++ b/perf-changelog.yaml @@ -2056,3 +2056,10 @@ - "Add --gpu-memory-utilization 0.9 to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126 +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm-mtp2 + description: + - "Add DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" + - "Recipes cover 8k/1k low-latency 2P/1D TP8 conc=1/2/4/8/16, low-middle 1P/4D DEP8/TP8 conc=16, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" + - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242 From cd137a1573dbe68d09d90a67ba0c09c887e227d2 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 18:44:57 -0700 Subject: [PATCH 2/3] Update GB200 DSV4 MTP2 Pareto recipes --- .github/configs/nvidia-master.yaml | 18 +- .../8k1k/agg-gb200-low-latency-mtp2.yaml | 86 ++++++++++ .../disagg-gb200-high-tpt-megamoe-mtp2.yaml | 20 +-- .../8k1k/disagg-gb200-low-latency-mtp2.yaml | 112 ++++++------- .../disagg-gb200-low-middle-curve-mtp2.yaml | 154 ------------------ .../disagg-gb200-mid-curve-megamoe-mtp2.yaml | 14 -- 6 files changed, 147 insertions(+), 257 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7fe815eaf..56521e394 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7764,26 +7764,24 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: - isl: 8192 osl: 1024 search-space: - # Low latency: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total - # with a dedicated NATS/etcd infra node. - - conc-list: [1, 2, 4, 8, 16] + # Aggregate low latency: TP=8, max-num-seqs=4. 
+ - conc-list: [1] spec-decoding: mtp prefill: - num-worker: 2 + num-worker: 1 tp: 8 ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml" decode: - num-worker: 1 + num-worker: 0 tp: 8 ep: 1 dp-attn: false - # Low-middle transition: 1 prefill (DEP=8) + 4 decode (TP=8). - # 11 nodes total with a dedicated NATS/etcd infra node. - - conc-list: [16] + # Low-latency bridge: 1 prefill (DEP=8) + 4 decode (TP=8), no offload. + - conc-list: [16, 32, 64] spec-decoding: mtp prefill: num-worker: 1 @@ -7791,7 +7789,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml" decode: num-worker: 4 tp: 8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml new file mode 100644 index 000000000..a5c3877d1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml @@ -0,0 +1,86 @@ +name: "svf-vllm-agg-gb200-low-latency-mtp2" + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + aggregated_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" 
+ VLLM_USE_NCCL_SYMM_MEM: "1" + TORCH_SYMMMEM: "NVSHMEM" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + speculative-config: '{"method":"mtp","num_speculative_tokens":2}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + attention-config: '{"use_fp4_indexer_cache":true}' + tokenizer-mode: deepseek_v4 + max-model-len: 9280 + max-num-seqs: 4 + max-num-batched-tokens: 8192 + max-cudagraph-capture-size: 4 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.9 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" + frameworks: + dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml index 3ae5f52be..1464135e5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml @@ -1,19 +1,5 @@ name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2" -# Mirrored from 
NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml -# -# Topology: 2 prefill (DEP=8 each) + 1 decode (DEP=8). 7 nodes total with a -# dedicated NATS/etcd infra node. MegaMOE MTP2 high-throughput point at -# concurrency 1024 with no CPU/NVMe offload. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" @@ -48,7 +34,6 @@ infra: frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null @@ -63,6 +48,8 @@ backend: VLLM_SERVER_DEV_MODE: "1" VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -77,6 +64,8 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" UCX_MEMTYPE_CACHE: "n" UCX_MEMTYPE_REG_WHOLE: "n" UCX_TLS: "cuda_copy,cuda_ipc,tcp" @@ -137,7 +126,6 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: type: "sa-bench" isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml index fda9903d4..016f03755 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml @@ -1,19 +1,5 @@ -name: "svf-vllm-disagg-gb200-low-latency-mtp2" +name: svf-vllm-disagg-gb200-low-latency-mtp2 -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-latency-mtp2.yaml -# -# Topology: 2 prefill (TP=8 each) + 1 decode (TP=8). 7 nodes total with a -# dedicated NATS/etcd infra node. MTP2 low-latency points at concurrencies -# 1/2/4/8/16. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" @@ -33,66 +19,67 @@ health_check: interval_seconds: 10 resources: - gpu_type: "gb200" + gpu_type: gb200 gpus_per_node: 4 - prefill_nodes: 4 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 gpus_per_prefill: 8 gpus_per_decode: 8 - infra: etcd_nats_dedicated_node: true - frontend: type: dynamo enable_multiple_frontends: false - backend: type: vllm connector: null prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
VLLM_ENGINE_READY_TIMEOUT_S: '3600' + TILELANG_CLEANUP_TEMP_FILES: '1' + VLLM_USE_NCCL_SYMM_MEM: '1' + NCCL_CUMEM_ENABLE: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_NVLS_ENABLE: '1' + VLLM_SERVER_DEV_MODE: '1' + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: '1024' + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: '2048' + UCX_MEMTYPE_CACHE: 'n' + UCX_MEMTYPE_REG_WHOLE: 'n' + UCX_TLS: cuda_copy,cuda_ipc,tcp + UCX_CUDA_IPC_ENABLE_MNNVL: 'y' NCCL_P2P_LEVEL: NVL decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" + VLLM_ENGINE_READY_TIMEOUT_S: '3600' + TILELANG_CLEANUP_TEMP_FILES: '1' + VLLM_USE_NCCL_SYMM_MEM: '1' + NCCL_CUMEM_ENABLE: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_NVLS_ENABLE: '1' + VLLM_SERVER_DEV_MODE: '1' + UCX_MEMTYPE_CACHE: 'n' + UCX_MEMTYPE_REG_WHOLE: 'n' + UCX_TLS: cuda_copy,cuda_ipc,tcp + UCX_CUDA_IPC_ENABLE_MNNVL: 'y' NCCL_P2P_LEVEL: NVL vllm_config: prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-prefill-2p1d-mtp2"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: deepseek-ai/DeepSeek-V4-Pro + kv-cache-dtype: fp8 + tensor-parallel-size: 1 pipeline-parallel-size: 1 + data-parallel-hybrid-lb: true + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true enforce-eager: true speculative-config: '{"method":"mtp","num_speculative_tokens":2}' attention-config: '{"use_fp4_indexer_cache":true}' max-model-len: 9280 max-num-seqs: 8 - max-num-batched-tokens: 32768 + max-num-batched-tokens: 16384 trust-remote-code: true no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true @@ -104,17 +91,17 @@ backend: numa-bind: true tokenizer-mode: deepseek_v4 decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both", "engine_id": "tp8-decode-2p1d-mtp2"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: deepseek-ai/DeepSeek-V4-Pro + kv-cache-dtype: fp8 tensor-parallel-size: 8 pipeline-parallel-size: 1 speculative-config: '{"method":"mtp","num_speculative_tokens":2}' attention-config: '{"use_fp4_indexer_cache":true}' max-model-len: 9280 - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true @@ -125,15 +112,14 @@ backend: no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true tokenizer-mode: deepseek_v4 - benchmark: - type: "sa-bench" + type: sa-bench isl: 8192 osl: 1024 - concurrencies: "1x2x4x8x16" - req_rate: "inf" + concurrencies: 16x32x64 + req_rate: inf use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + custom_tokenizer: sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml deleted file mode 100644 index 0fc0f5f4b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: "svf-vllm-disagg-gb200-low-middle-curve-mtp2" - -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# 
recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-low-middle-curve-mtp2.yaml -# -# Topology: 1 prefill (DEP=8) + 4 decode (TP=8). 11 nodes total with a -# dedicated NATS/etcd infra node. MTP2 low-middle transition point at -# concurrency 16. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" - precision: "fp4" - -dynamo: - install: true - wheel: "1.2.0.dev20260426" - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -infra: - etcd_nats_dedicated_node: true - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - prefill_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" - VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" - UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - decode_environment: - VLLM_ENGINE_READY_TIMEOUT_S: "3600" - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - TORCH_SYMMMEM: "NVSHMEM" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - 
UCX_MEMTYPE_CACHE: "n" - UCX_MEMTYPE_REG_WHOLE: "n" - UCX_TLS: "cuda_copy,cuda_ipc,tcp" - UCX_CUDA_IPC_ENABLE_MNNVL: "y" - NCCL_P2P_LEVEL: NVL - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-hybrid-lb: true - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - speculative-config: '{"method":"mtp","num_speculative_tokens":2}' - attention-config: '{"use_fp4_indexer_cache":true}' - max-model-len: 9280 - max-num-seqs: 8 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - no-async-scheduling: true - block-size: 256 - gpu-memory-utilization: 0.9 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - numa-bind: true - offload-group-size: 3 - offload-num-in-group: 1 - offload-prefetch-step: 2 - tokenizer-mode: deepseek_v4 - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - speculative-config: '{"method":"mtp","num_speculative_tokens":2}' - attention-config: '{"use_fp4_indexer_cache":true}' - max-model-len: 9280 - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - tokenizer-mode: deepseek_v4 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "16" - req_rate: "inf" - use_chat_template: true - 
custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" - frameworks: - dynamo: "1.2.0.dev20260426" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml index 41cb7e1ef..001101525 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml @@ -1,19 +1,5 @@ name: "svf-vllm-disagg-gb200-mid-curve-megamoe-mtp2" -# Mirrored from NVIDIA/srt-slurm codex/pr103-agg-dsv4-mtp branch: -# recipes/vllm/deepseek-v4-pro/GB200/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml -# -# Topology: 1 prefill (DEP=8) + 1 decode (DEP=8). 5 nodes total with a -# dedicated NATS/etcd infra node. MegaMOE MTP2 mid-curve point at -# concurrency 128 with no CPU/NVMe offload. -# -# Local deltas vs upstream: -# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match -# SRT_SLURM_MODEL_PREFIX in runners/launch_gb200-nv.sh. -# * model.container set to the vLLM nightly image to match -# nvidia-master.yaml image. -# * slurm.time_limit + health_check set to 8h / 1440 attempts to -# absorb cold-cache /mnt/numa1 model loads. 
model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f" From dc7bdf4a1f4a2c82bbfdade7d3b62d9f3f09eebf Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 18:47:53 -0700 Subject: [PATCH 3/3] Update GB200 MTP2 changelog description --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1753cc1d5..2dfcda9fe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2059,7 +2059,7 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm-mtp2 description: - - "Add DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" - - "Recipes cover 8k/1k low-latency 2P/1D TP8 conc=1/2/4/8/16, low-middle 1P/4D DEP8/TP8 conc=16, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" + - "Add final DeepSeek-V4-Pro FP4 GB200 Dynamo vLLM MTP2 Pareto recipes using vLLM nightly image" + - "Recipes cover 8k/1k aggregate TP8 low-latency conc=1, low-latency bridge 1P DEP8 + 4D TP8 no-offload conc=16/32/64, mid 1P/1D DEP8 MegaMOE conc=128, and high-throughput 2P/1D DEP8 MegaMOE conc=1024" - "All recipes enable FP4 indexer cache and speculative-config mtp with num_speculative_tokens=2" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1242