.github/configs/nvidia-master.yaml (92 additions, 0 deletions)
@@ -7830,6 +7830,98 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
ep: 8
dp-attn: true

dsv4-fp4-gb300-dynamo-vllm:
image: vllm/vllm-openai:v0.20.0-ubuntu2404
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb300-nv
precision: fp4
framework: dynamo-vllm
multinode: true
disagg: true
seq-len-configs:
- isl: 8192
osl: 1024
search-space:
- conc-list: [192]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml"
decode:
num-worker: 6
tp: 4
ep: 1
dp-attn: false
- conc-list: [18]
prefill:
num-worker: 1
tp: 4
ep: 4
dp-attn: false
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml"
decode:
num-worker: 17
tp: 4
ep: 1
dp-attn: false
- conc-list: [4096]
prefill:
num-worker: 4
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [4096]
prefill:
num-worker: 5
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [4096]
prefill:
num-worker: 6
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml"
decode:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
- conc-list: [3072]
prefill:
num-worker: 7
tp: 4
ep: 4
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml"
decode:
num-worker: 2
tp: 16
ep: 16
dp-attn: true
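# GPU footprint implied by each sweep point above (num-worker x tp, at the
# GB300 runner's 4 GPUs/node; derived here for reference, not part of the schema):
#   1p6d,  conc 192:  prefill 1x4=4,  decode 6x4=24  -> 28 GPUs / 7 nodes
#   1p17d, conc 18:   prefill 1x4=4,  decode 17x4=68 -> 72 GPUs / 18 nodes
#   4p1d,  conc 4096: prefill 4x4=16, decode 1x8=8   -> 24 GPUs / 6 nodes
#   5p1d,  conc 4096: prefill 5x4=20, decode 1x8=8   -> 28 GPUs / 7 nodes
#   6p1d,  conc 4096: prefill 6x4=24, decode 1x8=8   -> 32 GPUs / 8 nodes
#   7p2d,  conc 3072: prefill 7x4=28, decode 2x16=32 -> 60 GPUs / 15 nodes
# Each CONFIG_FILE above points at a recipe under recipes/vllm/deepseek-v4/8k1k/;
# two of those recipes (1p17d and 1p6d) are added in this same PR below.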

dsv4-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:deepseek-v4-grace-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml (new file)
@@ -0,0 +1,106 @@
name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4"

# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72
# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node.
# Wide-decode point at concurrency 18 — each decode worker holds a
# single replica.
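# TEP here appears to mean TP combined with expert parallelism: the prefill
# section below sets tensor-parallel-size 4 plus enable-expert-parallel, so
# MoE experts shard across the 4 TP ranks, while decode workers run plain TP4
# with no expert-parallel flag (our reading of the flags, not stated here).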
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 17
prefill_workers: 1
decode_workers: 17
gpus_per_prefill: 4
gpus_per_decode: 4

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"

vllm_config:
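# Both phases use vLLM's NIXL-based KV connector: prefill produces KV blocks
# and decode consumes them, with "kv_role": "kv_both" letting each worker
# register as producer and consumer. That reading of the NixlConnector options
# is ours; the recipe itself does not document it.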
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 4
pipeline-parallel-size: 1
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 256
max-num-batched-tokens: 16384
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.9
enable-ep-weight-filter: true
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 4
pipeline-parallel-size: 1
max-model-len: 16384
max-num-seqs: 512
max-cudagraph-capture-size: 512
max-num-batched-tokens: 512
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
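# The compilation-config on the next line requests full CUDA-graph capture for
# decode-only batches (cudagraph_mode FULL_DECODE_ONLY) with torch.compile
# disabled (mode 0); a hedged interpretation of vLLM's compilation knobs.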
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-ep-weight-filter: true
all2all-backend: "flashinfer_nvlink_one_sided"
no-enable-flashinfer-autotune: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "18"
req_rate: "inf"
tokenizer_mode: "deepseek_v4"
use_chat_template: true
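# isl/osl = input/output sequence length; 8192/1024 is the "8k1k" shape these
# recipes are named after.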
recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml (new file)
@@ -0,0 +1,114 @@
name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4"

# Topology: 1 prefill (DEP=4) + 6 decode (TP=4). 7 GB300 nodes (1P + 6D = 28
# GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. Low-mid curve
# point at concurrency 192.
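# DEP here appears to mean data-parallel attention plus expert parallelism:
# the prefill section below sets tensor-parallel-size 1, data-parallel-size 4,
# and enable-expert-parallel, so attention is replicated per DP rank while the
# MoE experts shard across all 4 prefill GPUs (again our reading of the flags).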
model:
path: "deepseek-v4-pro"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
time_limit: "8:00:00"

health_check:
max_attempts: 1440
interval_seconds: 10

resources:
gpu_type: "gb300"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 6
prefill_workers: 1
decode_workers: 6
gpus_per_prefill: 4
gpus_per_decode: 4

infra:
etcd_nats_dedicated_node: true

frontend:
type: dynamo
enable_multiple_frontends: false

backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
TORCH_SYMMMEM: "NVSHMEM"
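# TORCH_SYMMMEM=NVSHMEM (the one delta from the 1p17d environment) should
# select an NVSHMEM-backed symmetric-memory allocator in PyTorch; we assume it
# pairs with VLLM_USE_NCCL_SYMM_MEM above, as the recipe leaves it undocumented.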

vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 4
data-parallel-rpc-port: 13345
enable-expert-parallel: true
attention-config: '{"use_fp4_indexer_cache": true}'
moe-backend: "deep_gemm_mega_moe"
enforce-eager: true
max-model-len: 16384
max-num-seqs: 256
max-num-batched-tokens: 16384
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.9
enable-ep-weight-filter: true
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 4
pipeline-parallel-size: 1
max-model-len: 16384
max-num-seqs: 512
max-cudagraph-capture-size: 512
max-num-batched-tokens: 512
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-ep-weight-filter: true
all2all-backend: "flashinfer_nvlink_one_sided"
no-enable-flashinfer-autotune: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4

benchmark:
type: "sa-bench"
isl: 8192
osl: 1024
concurrencies: "192"
req_rate: "inf"
tokenizer_mode: "deepseek_v4"
use_chat_template: true