Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7749,6 +7749,87 @@ dsv4-fp4-gb200-dynamo-vllm:
ep: 8
dp-attn: true

# MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM nightly image
# and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
# Each search-space entry pins its serving recipe via a CONFIG_FILE entry in
# additional-settings; the worker counts / parallelism values here must stay
# in sync with the referenced recipe file.
# NOTE(review): indentation reconstructed from a whitespace-mangled diff;
# confirm the nesting (especially additional-settings as a sibling of
# prefill/decode) against the neighboring entries in this file.
dsv4-fp4-gb200-dynamo-vllm-mtp2:
  image: vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb200
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  seq-len-configs:
    - isl: 8192
      osl: 1024
      search-space:
        # Aggregate low latency: TP=8, max-num-seqs=4.
        # decode num-worker is 0 because the referenced agg recipe runs a
        # single aggregated worker (agg_workers: 1 in its resources), so no
        # separate decode workers are launched.
        - conc-list: [1]
          spec-decoding: mtp
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/agg-gb200-low-latency-mtp2.yaml"
          decode:
            num-worker: 0
            tp: 8
            ep: 1
            dp-attn: false

        # Low-latency bridge: 1 prefill (DEP=8) + 4 decode (TP=8), no offload.
        - conc-list: [16, 32, 64]
          spec-decoding: mtp
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-low-latency-mtp2.yaml"
          decode:
            num-worker: 4
            tp: 8
            ep: 1
            dp-attn: false

        # MegaMOE mid curve: 1 prefill (DEP=8) + 1 decode (DEP=8).
        # 5 nodes total with a dedicated NATS/etcd infra node.
        - conc-list: [128]
          spec-decoding: mtp
          prefill:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-mid-curve-megamoe-mtp2.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true

        # MegaMOE high throughput: 2 prefill (DEP=8 each) + 1 decode (DEP=8).
        # 7 nodes total with a dedicated NATS/etcd infra node.
        - conc-list: [1024]
          spec-decoding: mtp
          prefill:
            num-worker: 2
            tp: 8
            ep: 8
            dp-attn: true
          additional-settings:
            - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-high-tpt-megamoe-mtp2.yaml"
          decode:
            num-worker: 1
            tp: 8
            ep: 8
            dp-attn: true

dsv4-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:deepseek-v4-grace-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Recipe: aggregated (single-worker) GB200 low-latency serving of
# DeepSeek-V4-Pro FP4 with MTP-2 speculative decoding on dynamo + vLLM.
# Referenced from nvidia-master.yaml by the conc=1 search-space entry of
# dsv4-fp4-gb200-dynamo-vllm-mtp2.
name: "svf-vllm-agg-gb200-low-latency-mtp2"

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  time_limit: "8:00:00"

health_check:
  # Up to 1440 attempts x 10 s = 4 h, allowing for slow model load.
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  # One aggregated worker spanning 8 GPUs across 2 nodes (4 GPUs each).
  agg_nodes: 2
  agg_workers: 1
  gpus_per_agg: 8

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null
  aggregated_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    # Quoted so every env value in this map is a YAML string (was the only
    # unquoted value here).
    NCCL_P2P_LEVEL: "NVL"
  vllm_config:
    aggregated:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 8
      pipeline-parallel-size: 1
      # "MTP2": two speculative tokens per step via MTP.
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      tokenizer-mode: deepseek_v4
      # 9280 covers 8192 ISL + 1024 OSL (= 9216) plus 64 tokens of headroom,
      # presumably for speculative/draft tokens — TODO confirm.
      max-model-len: 9280
      max-num-seqs: 4
      max-num-batched-tokens: 8192
      max-cudagraph-capture-size: 4
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      block-size: 256
      gpu-memory-utilization: 0.9

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "1"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

# Pins for reproducibility: model revision, container digest tag, framework
# wheel — keep in sync with the model/dynamo sections above.
identity:
  model:
    repo: "deepseek-ai/DeepSeek-V4-Pro"
    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
  container:
    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
  frameworks:
    dynamo: "1.2.0.dev20260426"
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Recipe: disaggregated GB200 high-throughput serving of DeepSeek-V4-Pro FP4
# with the MegaMOE MoE backend and MTP-2 speculative decoding:
# 2 prefill workers (DEP=8 each) + 1 decode worker (DEP=8) on dynamo + vLLM.
# Referenced from nvidia-master.yaml by the conc=1024 search-space entry of
# dsv4-fp4-gb200-dynamo-vllm-mtp2.
name: "svf-vllm-disagg-gb200-high-tpt-megamoe-mtp2"

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
  precision: "fp4"

dynamo:
  install: true
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  time_limit: "8:00:00"

health_check:
  # Up to 1440 attempts x 10 s = 4 h for model load / cluster bring-up.
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  # 2 prefill workers x 8 GPUs + 1 decode worker x 8 GPUs = 6 GPU nodes.
  prefill_nodes: 4
  decode_nodes: 2
  prefill_workers: 2
  decode_workers: 1
  gpus_per_prefill: 8
  gpus_per_decode: 8

infra:
  # Run NATS/etcd on a dedicated node, separate from the GPU workers.
  etcd_nats_dedicated_node: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

backend:
  type: vllm
  connector: null
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
    # Debug/simulation knobs, intentionally left disabled:
    # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
    # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    # Quoted so every env value in this map is a YAML string (was the only
    # unquoted value here).
    NCCL_P2P_LEVEL: "NVL"
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    TORCH_SYMMMEM: "NVSHMEM"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    VLLM_SERVER_DEV_MODE: "1"
    # Debug/simulation knobs, intentionally left disabled:
    # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
    # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
    UCX_MEMTYPE_CACHE: "n"
    UCX_MEMTYPE_REG_WHOLE: "n"
    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
    # Quoted so every env value in this map is a YAML string (was the only
    # unquoted value here).
    NCCL_P2P_LEVEL: "NVL"
  vllm_config:
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      # DEP=8: data-parallel size 8 with expert parallelism enabled; TP=1.
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 8
      # NOTE(review): same RPC port as the decode section below — fine as
      # long as prefill and decode workers never share a node (they use
      # disjoint node pools above); confirm.
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      # Prefill runs eager (no CUDA graphs); decode uses FULL_DECODE_ONLY.
      enforce-eager: true
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 9280
      max-num-seqs: 16
      max-num-batched-tokens: 32768
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      no-async-scheduling: true
      block-size: 256
      gpu-memory-utilization: 0.94
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      numa-bind: true
      tokenizer-mode: deepseek_v4
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 8
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 9280
      max-num-seqs: 512
      max-cudagraph-capture-size: 512
      max-num-batched-tokens: 1024
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      block-size: 256
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  concurrencies: "1024"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

# Pins for reproducibility: model revision, container digest tag, framework
# wheel — keep in sync with the model/dynamo sections above.
identity:
  model:
    repo: "deepseek-ai/DeepSeek-V4-Pro"
    revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
  container:
    image: "vllm/vllm-openai:nightly-a749a33d8d05acdd3ab346bd3f0c6b5c9c80474f"
  frameworks:
    dynamo: "1.2.0.dev20260426"
Loading
Loading