diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 56521e394..8a80b3fb3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7830,6 +7830,98 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: ep: 8 dp-attn: true +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-nv + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [18] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [3072] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml" + 
decode: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml new file mode 100644 index 000000000..a2c3ab80a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -0,0 +1,106 @@ +name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4" + +# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72 +# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node. +# Wide-decode point at concurrency 18 — each decode worker holds a +# single replica. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 17 + prefill_workers: 1 + decode_workers: 17 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + 
enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "18" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml new file mode 100644 index 000000000..c3b25acc1 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4" + +# Topology: 1 prefill (DEP=4) + 6 decode (TP=4). 7 GB300 nodes (1P + 6D = 28 +# GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. Low-mid curve +# point at concurrency 192. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 6 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 
+ max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "192" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml new file mode 100644 index 000000000..b97ef0d5a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml @@ -0,0 +1,122 @@ +name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24" + +# Topology: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 nodes (4P + 2D +# = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml new file mode 100644 index 000000000..d83e6d771 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -0,0 +1,122 @@ +name: "svf-vllm-disagg-gb300-5p1d-dep4-dep8-28" + +# Topology: 5 prefill (DEP=4 each) + 1 decode (DEP=8). 7 GB300 nodes (5P + 2D +# = 28 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml new file mode 100644 index 000000000..4b54cc13e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -0,0 +1,122 @@ +name: "svf-vllm-disagg-gb300-6p1d-dep4-dep8-32" + +# Topology: 6 prefill (DEP=4 each) + 1 decode (DEP=8). 8 GB300 nodes (6P + 2D +# = 32 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml new file mode 100644 index 000000000..43c2031a8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16" + +# Topology: 7 prefill (DEP=4) + 2 decode (DEP=16). 15 GB300 nodes (7P + 8D +# = 60 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Wide-EP decode max-throughput point at concurrency 3072. 
+model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + decode_nodes: 8 + prefill_workers: 7 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 
+ data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "3072" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2dfcda9fe..0403c2385 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2041,6 +2041,12 @@ - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated Dynamo vLLM benchmarks at 8k/1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1238 + - config-keys: - qwen3.5-fp8-b200-sglang description: diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index eaac2a1e0..413df8d8e 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -7,12 +7,36 @@ set -x -if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights staged on compute-node-local NVMe at /scratch/models/dsv4/. - # The exact upstream recipes refer to this model as `dspro`. +if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on compute-node-local NVMe. export MODEL_PATH="/scratch/models/dsv4/" + + if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + # Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` + # (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt + # generation). The single-threaded random prompt generator in the + # upstream sa-bench dominates bench startup on the 7p1d/conc=8192 + # sweep (~50 min for the main pass alone before the first HTTP + # request leaves the client). The fork bumps that to ~1 min via + # multiprocessing.Pool with `--random-num-workers 48`. + # + # TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR + # (https://github.com/NVIDIA/srt-slurm/pull/114) merges. 
+ SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" + SRT_SLURM_RECIPES_REF="4249d168208ff5ff1f30b3c1158d893cc0615bb5" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" + SRT_RECIPE_DST="recipes/sglang/deepseek-v4" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" + SRT_SLURM_RECIPES_REF="aflowers/gb200-dsv4-recipes" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" + SRT_RECIPE_DST="recipes/vllm/deepseek-v4" + else + echo "Unsupported framework on gb300-cw for dsv4/fp4: $FRAMEWORK. Currently supported: dynamo-sglang, dynamo-vllm" + exit 1 + fi else - echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4" exit 1 fi @@ -32,18 +56,6 @@ export NVIDIA_VISIBLE_DEVICES=all export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" -# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` -# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt -# generation). The single-threaded random prompt generator in the -# upstream sa-bench dominates bench startup on the 7p1d/conc=8192 -# sweep (~50 min for the main pass alone before the first HTTP -# request leaves the client). The fork bumps that to ~1 min via -# multiprocessing.Pool with `--random-num-workers 48`. -# -# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR -# (https://github.com/NVIDIA/srt-slurm/pull/114) merges. -SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" -SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5" # Squash files live alongside models on /mnt/vast (shared across nodes). 
# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / @@ -51,33 +63,29 @@ SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5" # old /mnt/vast/squash dir contains '+'-separated files from prior runs. SQUASH_DIR="/mnt/vast/squash_dupe" mkdir -p "$SQUASH_DIR" -# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the -# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as -# arm64-only. The CI runner pod is x86_64 and (a) cannot run -# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs` -# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"), -# and (b) even with `--arch aarch64` the conversion still fails on x86. -# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780` -# the import has to be dispatched to an arm64 compute node via srun. -# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh -# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist -# script when the docker tag is updated). Filename suffix `_arm64` -# distinguishes the working arm64 sqsh from any stale amd64 shadow. SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" -if [[ ! -f "$SQUASH_FILE" ]]; then - echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2 - echo "Refresh it on a GB300 compute node via the script in the gist:" >&2 - echo " https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2 - exit 1 -fi -if [[ ! 
-f "$NGINX_SQUASH_FILE" ]]; then - echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2 - echo "Run on an aarch64 host:" >&2 - echo " enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2 - exit 1 -fi +# Run the import on a compute node via srun, not on the runner pod: +# the runner pod is x86_64 while the compute nodes are aarch64, so the +# arm64 squash file has to be built on a compute node. +import_squash() { + local squash="$1" image="$2" + local lock="${squash}.lock" + srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --exclusive --time=180 bash -c " + exec 9>\"$lock\" + flock -w 600 9 || { echo 'Failed to acquire lock for $squash' >&2; exit 1; } + if unsquashfs -l \"$squash\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import: $squash' + else + rm -f \"$squash\" + enroot import -o \"$squash\" docker://$image + fi + " +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -102,15 +110,11 @@ fi git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout "$SRT_SLURM_RECIPES_COMMIT" +git checkout "$SRT_SLURM_RECIPES_REF" -# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm -# checkout. Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch: -# destination must be `recipes/sglang/deepseek-v4` because -# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...` -# in `.github/configs/nvidia-master.yaml` is what srtctl loads. -mkdir -p recipes/sglang/deepseek-v4 -cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 +# Overlay the hand-rolled DSV4 recipes onto the selected srt-slurm checkout. +mkdir -p "$SRT_RECIPE_DST" +cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST" echo "Installing srtctl..." # CRITICAL — uv install location. 
@@ -166,7 +170,7 @@ mkdir -p configs/dynamo-wheels echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < /dev/null; then @@ -82,7 +93,7 @@ fi echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) -SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +SRTCTL_ROOT="${SRT_REPO_DIR}" echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < list[str]: + if not runner_node_filter: + return [runner] + + candidates = runner_data.get(runner, []) + if runner_node_filter in runner: + candidates = [runner, *candidates] + + matches = [] + seen = set() + for node in candidates: + if runner_node_filter in node and node not in seen: + matches.append(node) + seen.add(node) + return matches + + +def generate_test_config_sweep(args, all_config_data, runner_data=None): """Generate full sweep for specific config keys. Validates that all specified config keys exist before generating. @@ -551,6 +568,8 @@ def generate_test_config_sweep(args, all_config_data): matrix_values = [] + runner_data = runner_data or {} + for key in resolved_keys: val = all_config_data[key] is_multinode = val.get(Fields.MULTINODE.value, False) @@ -561,6 +580,10 @@ def generate_test_config_sweep(args, all_config_data): precision = val[Fields.PRECISION.value] framework = val[Fields.FRAMEWORK.value] runner = val[Fields.RUNNER.value] + runners_for_entry = _runner_values_for_filter( + runner, runner_data, getattr(args, 'runner_node_filter', None)) + if not runners_for_entry: + continue disagg = val.get(Fields.DISAGG.value, False) # Build seq-len filter if --seq-lens was provided @@ -607,25 +630,26 @@ def generate_test_config_sweep(args, all_config_data): # No intersection with requested conc values; skip continue - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - 
Fields.OSL.value: osl, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.PREFILL.value: prefill, - Fields.DECODE.value: decode, - Fields.CONC.value: conc_values, - Fields.MAX_MODEL_LEN.value: isl + osl + 256, - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - Fields.DISAGG.value: disagg, - Fields.RUN_EVAL.value: False, - } - matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, + Fields.MAX_MODEL_LEN.value: isl + osl + 256, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + Fields.RUN_EVAL.value: False, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: # Single-node config tp = bmk[Fields.TP.value] @@ -657,26 +681,27 @@ def generate_test_config_sweep(args, all_config_data): continue for conc in conc_values: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - Fields.OSL.value: osl, - Fields.TP.value: tp, - Fields.CONC.value: conc, - Fields.MAX_MODEL_LEN.value: isl + osl + 256, - Fields.EP.value: ep if ep is not None else 1, - Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - Fields.DISAGG.value: disagg, - Fields.RUN_EVAL.value: False, - } - matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + for 
class TestGenerateTestConfigSweep:
    """Exercise generate_test_config_sweep with a runner_node_filter set."""

    @staticmethod
    def _namespace(node_filter):
        """Build args selecting the GB200 TRT config key with the given node filter."""
        return argparse.Namespace(
            config_keys=["dsr1-fp4-gb200-dynamo-trt"],
            seq_lens=None,
            conc=None,
            runner_node_filter=node_filter,
        )

    def test_runner_node_filter_expands_config_runner(self, sample_multinode_config, sample_runner_config):
        """A filter naming one concrete node yields a single entry pinned to that node."""
        sweep = generate_test_config_sweep(
            self._namespace("gb200-nv_0"),
            sample_multinode_config,
            sample_runner_config,
        )

        assert len(sweep) == 1
        assert sweep[0]["runner"] == "gb200-nv_0"

    def test_runner_node_filter_no_match_skips_config(self, sample_multinode_config, sample_runner_config):
        """A filter that matches no runner node yields an empty sweep."""
        sweep = generate_test_config_sweep(
            self._namespace("gb300-nv_0"),
            sample_multinode_config,
            sample_runner_config,
        )

        assert sweep == []