From 58229f0456506f21c470a4878fa936df85f43eac Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Thu, 30 Apr 2026 11:33:19 -0700 Subject: [PATCH 01/12] Add recipes --- .github/configs/nvidia-master.yaml | 92 ++++++++++++ .../8k1k/disagg-gb300-1p17d-tep4-tp4.yaml | 116 +++++++++++++++ .../8k1k/disagg-gb300-1p6d-dep4-tp4.yaml | 124 ++++++++++++++++ .../disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml | 132 ++++++++++++++++++ .../disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml | 132 ++++++++++++++++++ .../disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml | 132 ++++++++++++++++++ .../8k1k/disagg-gb300-7p2d-dep4-dep16.yaml | 129 +++++++++++++++++ perf-changelog.yaml | 6 + 8 files changed, 863 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index aff249a8b..8cc7eea7e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7749,6 +7749,98 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 8 dp-attn: true +dsv4-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [192, 256] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [18, 36, 72] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [2048, 3072] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml" + decode: + num-worker: 2 + tp: 16 + ep: 16 + dp-attn: true + dsv4-fp4-gb300-dynamo-sglang: image: 
lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml new file mode 100644 index 000000000..fb1bfb4e6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -0,0 +1,116 @@ +name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4" + +# Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72 +# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node. Sweeps +# concurrencies 18, 36, and 72 — wide-decode points where each decode +# worker holds a single replica. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 17 + prefill_workers: 1 + decode_workers: 17 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "18x36x72" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml new file mode 100644 index 000000000..ba43e47df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -0,0 +1,124 @@ +name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4" + +# Topology: 1 prefill (DEP=4) + 6 decode (TP=4). 7 GB300 nodes (1P + 6D = 28 +# GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. Low-mid curve +# points at concurrencies 192 and 256. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 6 + prefill_workers: 1 + decode_workers: 6 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "192x256" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: 
"1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml new file mode 100644 index 000000000..ea35bf247 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml @@ -0,0 +1,132 @@ +name: "svf-vllm-disagg-gb300-4p1d-dep4-dep8-24" + +# Topology: 4 prefill (DEP=4 each) + 1 decode (DEP=8). 6 GB300 nodes (4P + 2D +# = 24 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: 
"sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml new file mode 100644 index 000000000..a0221c1b5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -0,0 +1,132 @@ +name: "svf-vllm-disagg-gb300-5p1d-dep4-dep8-28" + +# Topology: 5 prefill (DEP=4 each) + 1 decode (DEP=8). 7 GB300 nodes (5P + 2D +# = 28 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + 
enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml new file mode 100644 index 000000000..3b3f481cc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -0,0 +1,132 @@ +name: "svf-vllm-disagg-gb300-6p1d-dep4-dep8-32" + +# Topology: 6 prefill (DEP=4 each) + 1 decode (DEP=8). 8 GB300 nodes (6P + 2D +# = 32 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Max-throughput point at concurrency 4096 with deep_gemm_mega_moe on +# both workers. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml new file mode 100644 index 000000000..8332f5952 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -0,0 +1,129 @@ +name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16" + +# Topology: 7 prefill (DEP=4) + 2 decode (DEP=16). 15 GB300 nodes (7P + 8D +# = 60 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. +# Wide-EP decode max-throughput sweep at concurrencies 2048 and 3072. +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:v0.20.0-ubuntu2404" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + decode_nodes: 8 + prefill_workers: 7 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +infra: + etcd_nats_dedicated_node: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + 
max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x3072" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" + +identity: + model: + repo: "deepseek-ai/DeepSeek-V4-Pro" + revision: "0366e4e064385807ea86b088a5c6c878ff23343b" + container: + image: "vllm/vllm-openai:v0.20.0-ubuntu2404" + frameworks: + dynamo: "1.2.0.dev20260426" + vllm: "0.20.0" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ae2b88f03..904faddca 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2031,3 +2031,9 @@ - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 + +- config-keys: + - dsv4-fp4-gb300-dynamo-vllm + description: + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated Dynamo vLLM benchmarks at 8k/1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1238 From 3d48dd10033494284c40aa7521b99f6c8eb37c29 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Thu, 30 Apr 2026 11:51:44 -0700 Subject: [PATCH 02/12] fix benchmark, fix srt-slurm branch --- .../vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml | 2 +- .../deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml | 2 +- runners/launch_gb200-nv.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml index fb1bfb4e6..23af4a98f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -102,8 +102,8 @@ benchmark: osl: 1024 concurrencies: "18x36x72" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml index ba43e47df..bb72ba2fa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -110,8 +110,8 @@ benchmark: osl: 1024 concurrencies: "192x256" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml index ea35bf247..d7ea1bd31 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml @@ -118,8 +118,8 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml index a0221c1b5..adae8f9da 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -118,8 +118,8 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml index 3b3f481cc..5bf03fd59 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -118,8 +118,8 @@ benchmark: osl: 1024 concurrencies: "4096" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml index 8332f5952..ce962102b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -115,8 +115,8 @@ benchmark: osl: 1024 concurrencies: "2048x3072" req_rate: "inf" + tokenizer_mode: "deepseek_v4" use_chat_template: true - custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer" identity: model: diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 01be0fd29..ce1c3e13f 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -149,7 +149,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout aflowers/vllm-gb200-v0.20.0 + git checkout aflowers/gb200-dsv4-recipes # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create From 8359a192af87ed7a6e085caa257f0e7815378f15 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:52:13 -0700 Subject: [PATCH 03/12] update runner --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_gb200-nv.sh | 2 +- runners/launch_gb300-nv.sh | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8cc7eea7e..b6872e3d4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7753,7 +7753,7 @@ dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300-cw + runner: gb300 precision: fp4 framework: dynamo-vllm multinode: true diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ce1c3e13f..01be0fd29 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -149,7 +149,7 @@ fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone 
https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout aflowers/gb200-dsv4-recipes + git checkout aflowers/vllm-gb200-v0.20.0 # Use `cp -rT` so if the upstream branch ever ships a stub # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto # it rather than nesting (`cp -r src dst` would create diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 5f48ddcec..02556d5ec 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -18,8 +18,11 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export SERVED_MODEL_NAME="deepseek-r1-fp8" export MODEL_PATH=/raid/shared/models/deepseek-r1-0528 export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" +elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + export MODEL_PATH=/raid/shared/models/deepseek-v4-pro/ + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4" exit 1 fi @@ -43,9 +46,17 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" -cd "$SRT_REPO_DIR" -git checkout sa-submission-q2-2026 +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout aflowers/gb200-dsv4-recipes + mkdir -p recipes/vllm/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +else + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 +fi echo "Installing srtctl..." 
export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" From fa93b4d29e4cca8e050c5f978cddad790706d1ad Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 16:10:37 -0700 Subject: [PATCH 04/12] chore: resolve dsv4 gb300 changelog merge markers --- perf-changelog.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bff6e7ae2..cffa4f683 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2033,15 +2033,13 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 - config-keys: -<<<<<<< dsv4-gb300-vllm - dsv4-fp4-gb300-dynamo-vllm description: - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated Dynamo vLLM benchmarks at 8k/1k" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1238 -======= + +- config-keys: - qwen3.5-fp8-b200-sglang description: - updated sglang container image pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027 - ->>>>>>> main From 2107b42c079738854232e3e012bca077866b41b2 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 16:34:41 -0700 Subject: [PATCH 05/12] fix: use gb300 local dsv4 model path --- runners/launch_gb300-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index b3b1954ee..2bd7ee26f 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -19,7 +19,7 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then export MODEL_PATH=/scratch/models/DeepSeek-R1-0528 export SRT_SLURM_MODEL_PREFIX="dsr1-fp8" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - export MODEL_PATH=/raid/shared/models/deepseek-v4-pro/ + export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4" From 9ffe98b4a028db885b5b0c930d3cf6d4c521ecfd Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 16:44:33 -0700 Subject: [PATCH 06/12] ci: support runner filtering for test configs --- utils/matrix_logic/generate_sweep_configs.py | 107 +++++++++++------- .../test_generate_sweep_configs.py | 45 +++++++- 2 files changed, 110 insertions(+), 42 deletions(-) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e9a2195ed..aeebcfa1f 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -541,7 +541,24 @@ def get_lowest_conc(search_space_entry): return matrix_values -def generate_test_config_sweep(args, all_config_data): +def _runner_values_for_filter(runner: str, runner_data: dict, runner_node_filter: str | None) -> list[str]: + if not runner_node_filter: + return [runner] + + candidates = runner_data.get(runner, []) + if runner_node_filter in runner: + candidates = [runner, *candidates] + + matches = [] + seen = set() + for node in candidates: + if runner_node_filter in node and node not in seen: + matches.append(node) + seen.add(node) + return matches + + +def generate_test_config_sweep(args, all_config_data, runner_data=None): """Generate full sweep for specific config keys. Validates that all specified config keys exist before generating. 
@@ -551,6 +568,8 @@ def generate_test_config_sweep(args, all_config_data): matrix_values = [] + runner_data = runner_data or {} + for key in resolved_keys: val = all_config_data[key] is_multinode = val.get(Fields.MULTINODE.value, False) @@ -561,6 +580,10 @@ def generate_test_config_sweep(args, all_config_data): precision = val[Fields.PRECISION.value] framework = val[Fields.FRAMEWORK.value] runner = val[Fields.RUNNER.value] + runners_for_entry = _runner_values_for_filter( + runner, runner_data, getattr(args, 'runner_node_filter', None)) + if not runners_for_entry: + continue disagg = val.get(Fields.DISAGG.value, False) # Build seq-len filter if --seq-lens was provided @@ -607,25 +630,26 @@ def generate_test_config_sweep(args, all_config_data): # No intersection with requested conc values; skip continue - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - Fields.OSL.value: osl, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.PREFILL.value: prefill, - Fields.DECODE.value: decode, - Fields.CONC.value: conc_values, - Fields.MAX_MODEL_LEN.value: isl + osl + 256, - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - Fields.DISAGG.value: disagg, - Fields.RUN_EVAL.value: False, - } - matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, + Fields.MAX_MODEL_LEN.value: isl + osl + 256, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + Fields.RUN_EVAL.value: False, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) else: # Single-node config tp = bmk[Fields.TP.value] @@ -657,26 +681,27 @@ def generate_test_config_sweep(args, all_config_data): continue for conc in conc_values: - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.MODEL_PREFIX.value: model_code, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - Fields.OSL.value: osl, - Fields.TP.value: tp, - Fields.CONC.value: conc, - Fields.MAX_MODEL_LEN.value: isl + osl + 256, - Fields.EP.value: ep if ep is not None else 1, - Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - Fields.DISAGG.value: disagg, - Fields.RUN_EVAL.value: False, - } - matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + for runner_value in runners_for_entry: + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner_value, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 256, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not 
None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + Fields.RUN_EVAL.value: False, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) return matrix_values @@ -947,7 +972,7 @@ def main(): matrix_values = generate_runner_model_sweep_config( args, all_config_data, runner_data) elif args.command == 'test-config': - matrix_values = generate_test_config_sweep(args, all_config_data) + matrix_values = generate_test_config_sweep(args, all_config_data, runner_data) else: parser.error(f"Unknown command: {args.command}") diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py index a03ded47f..34bd4dc3d 100644 --- a/utils/matrix_logic/test_generate_sweep_configs.py +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -8,6 +8,7 @@ seq_len_to_str, generate_full_sweep, generate_runner_model_sweep_config, + generate_test_config_sweep, mark_eval_entries, apply_node_type_defaults, expand_config_keys, @@ -1534,6 +1535,49 @@ def full_sweep_args_both(): return args +# ============================================================================= +# Test generate_test_config_sweep +# ============================================================================= + +class TestGenerateTestConfigSweep: + """Tests for exact config-key sweep generation.""" + + def test_runner_node_filter_expands_config_runner(self, sample_multinode_config, sample_runner_config): + """test-config should allow targeting one concrete runner node.""" + args = argparse.Namespace( + config_keys=["dsr1-fp4-gb200-dynamo-trt"], + seq_lens=None, + conc=None, + runner_node_filter="gb200-nv_0", + ) + + result = generate_test_config_sweep( + args, + sample_multinode_config, + sample_runner_config, + ) + + assert len(result) == 1 + assert result[0]["runner"] == "gb200-nv_0" + + def test_runner_node_filter_no_match_skips_config(self, sample_multinode_config, sample_runner_config): + """Unmatched node filters should produce no entries.""" + args = argparse.Namespace( + config_keys=["dsr1-fp4-gb200-dynamo-trt"], + seq_lens=None, + conc=None, + runner_node_filter="gb300-nv_0", + ) + + result = generate_test_config_sweep( + args, + sample_multinode_config, + sample_runner_config, + ) + + assert result == [] + + # ============================================================================= # Test apply_node_type_defaults # ============================================================================= @@ -1885,4 +1929,3 @@ def test_prefill_entries_never_in_single_or_evals(self, mixed_entries): assert all('prefill' not in x for x in single) assert all('prefill' not in x for x in evals) - From dd67fb52d0275f7cd4f0bf1a2e854508b9163da3 Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 16:49:01 -0700 Subject: [PATCH 07/12] fix: isolate gb300 srt setup state --- runners/launch_gb300-nv.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 2bd7ee26f..cbda51af8 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -2,7 +2,7 @@ # This script sets up the environment and launches multi-node benchmarks -set -x +set -exo pipefail export SLURM_PARTITION="batch_1" export SLURM_ACCOUNT="benchmark" @@ -58,11 +58,9 @@ export ISL="$ISL" export OSL="$OSL" echo "Cloning srt-slurm repository..." 
-SRT_REPO_DIR="srt-slurm" -if [ -d "$SRT_REPO_DIR" ]; then - echo "Removing existing $SRT_REPO_DIR..." - rm -rf "$SRT_REPO_DIR" -fi +RUN_KEY=$(printf "%s" "${RESULT_FILENAME:-${RUNNER_NAME:-gb300-nv}}" | sha1sum | cut -c1-12) +SRT_REPO_DIR="${GITHUB_WORKSPACE}/srt-slurm-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}" +rm -rf "$SRT_REPO_DIR" if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" @@ -81,8 +79,10 @@ export UV_INSTALL_DIR="$GITHUB_WORKSPACE/.local/bin" curl -LsSf https://astral.sh/uv/install.sh | sh export PATH="$UV_INSTALL_DIR:$PATH" -uv venv "$GITHUB_WORKSPACE/.venv" -source "$GITHUB_WORKSPACE/.venv/bin/activate" +VENV_DIR="${GITHUB_WORKSPACE}/.venv-srt-${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUN_KEY}" +rm -rf "$VENV_DIR" +uv venv "$VENV_DIR" +source "$VENV_DIR/bin/activate" uv pip install -e . if ! command -v srtctl &> /dev/null; then @@ -93,7 +93,7 @@ fi echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) -SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +SRTCTL_ROOT="${SRT_REPO_DIR}" echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < Date: Thu, 30 Apr 2026 16:52:54 -0700 Subject: [PATCH 08/12] fix: remove unsupported gb300 recipe metadata --- .../deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml | 10 ---------- .../deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml | 10 ---------- .../8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml | 10 ---------- .../8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml | 10 ---------- .../8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml | 10 ---------- .../deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml | 10 ---------- 6 files changed, 60 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml index 23af4a98f..f00e62b22 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -104,13 +104,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml index bb72ba2fa..4e2c6b41d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -112,13 +112,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml index d7ea1bd31..b97ef0d5a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml @@ -120,13 +120,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml index adae8f9da..d83e6d771 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -120,13 +120,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml index 5bf03fd59..4b54cc13e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -120,13 +120,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml index ce962102b..ac8a35c56 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -117,13 +117,3 @@ benchmark: req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true - -identity: - model: - repo: "deepseek-ai/DeepSeek-V4-Pro" - revision: "0366e4e064385807ea86b088a5c6c878ff23343b" - container: - image: "vllm/vllm-openai:v0.20.0-ubuntu2404" - frameworks: - dynamo: "1.2.0.dev20260426" - vllm: "0.20.0" From aea8e0675782bdf9e49185ad71d4128e6802f235 Mon Sep 17 00:00:00 2001 From: hjjq <50634613+hjjq@users.noreply.github.com> Date: Thu, 30 Apr 2026 20:59:43 -0700 Subject: [PATCH 09/12] clean up --- .github/configs/nvidia-master.yaml | 6 +++--- .../deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml | 8 ++++---- .../vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml | 4 ++-- .../deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml | 4 ++-- 4 
files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dae5e3d34..c5917fece 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7762,7 +7762,7 @@ dsv4-fp4-gb300-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [192, 256] + - conc-list: [192] prefill: num-worker: 1 tp: 4 @@ -7775,7 +7775,7 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 4 ep: 1 dp-attn: false - - conc-list: [18, 36, 72] + - conc-list: [18] prefill: num-worker: 1 tp: 4 @@ -7827,7 +7827,7 @@ dsv4-fp4-gb300-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - - conc-list: [2048, 3072] + - conc-list: [3072] prefill: num-worker: 7 tp: 4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml index f00e62b22..a2c3ab80a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -1,9 +1,9 @@ name: "svf-vllm-disagg-gb300-1p17d-tep4-tp4" # Topology: 1 prefill (TEP=4) + 17 decode (TP=4). 18 GB300 nodes (1P + 17D = 72 -# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node. Sweeps -# concurrencies 18, 36, and 72 — wide-decode points where each decode -# worker holds a single replica. +# GPUs at 4 GPUs/node), NATS/etcd colocated on the prefill node. +# Wide-decode point at concurrency 18 — each decode worker holds a +# single replica. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:v0.20.0-ubuntu2404" @@ -100,7 +100,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "18x36x72" + concurrencies: "18" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml index 4e2c6b41d..c3b25acc1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p6d-dep4-tp4.yaml @@ -2,7 +2,7 @@ name: "svf-vllm-disagg-gb300-1p6d-dep4-tp4" # Topology: 1 prefill (DEP=4) + 6 decode (TP=4). 7 GB300 nodes (1P + 6D = 28 # GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. Low-mid curve -# points at concurrencies 192 and 256. +# point at concurrency 192. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:v0.20.0-ubuntu2404" @@ -108,7 +108,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "192x256" + concurrencies: "192" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml index ac8a35c56..43c2031a8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -2,7 +2,7 @@ name: "svf-vllm-disagg-gb300-7p2d-dep4-dep16" # Topology: 7 prefill (DEP=4) + 2 decode (DEP=16). 15 GB300 nodes (7P + 8D # = 60 GPUs at 4 GPUs/node) plus a dedicated NATS/etcd infra node. 
-# Wide-EP decode max-throughput sweep at concurrencies 2048 and 3072. +# Wide-EP decode max-throughput point at concurrency 3072. model: path: "deepseek-v4-pro" container: "vllm/vllm-openai:v0.20.0-ubuntu2404" @@ -113,7 +113,7 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048x3072" + concurrencies: "3072" req_rate: "inf" tokenizer_mode: "deepseek_v4" use_chat_template: true From f9a1cc4269992fa81de22c27dd35c2a84b0328af Mon Sep 17 00:00:00 2001 From: Alec Flowers Date: Thu, 30 Apr 2026 21:21:12 -0700 Subject: [PATCH 10/12] fix: support gb300 cw vllm launcher --- runners/launch_gb300-cw.sh | 59 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index eaac2a1e0..5e593ac00 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -7,12 +7,36 @@ set -x -if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Weights staged on compute-node-local NVMe at /scratch/models/dsv4/. - # The exact upstream recipes refer to this model as `dspro`. +if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on compute-node-local NVMe. export MODEL_PATH="/scratch/models/dsv4/" + + if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + # Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` + # (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt + # generation). The single-threaded random prompt generator in the + # upstream sa-bench dominates bench startup on the 7p1d/conc=8192 + # sweep (~50 min for the main pass alone before the first HTTP + # request leaves the client). The fork bumps that to ~1 min via + # multiprocessing.Pool with `--random-num-workers 48`. + # + # TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR + # (https://github.com/NVIDIA/srt-slurm/pull/114) merges. + SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" + SRT_SLURM_RECIPES_REF="4249d168208ff5ff1f30b3c1158d893cc0615bb5" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" + SRT_RECIPE_DST="recipes/sglang/deepseek-v4" + elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then + SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" + SRT_SLURM_RECIPES_REF="aflowers/gb200-dsv4-recipes" + SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" + SRT_RECIPE_DST="recipes/vllm/deepseek-v4" + else + echo "Unsupported framework on gb300-cw for dsv4/fp4: $FRAMEWORK. Currently supported: dynamo-sglang, dynamo-vllm" + exit 1 + fi else - echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + echo "Unsupported model prefix/precision combination on gb300-cw: $MODEL_PREFIX/$PRECISION. Currently supported: dsv4/fp4" exit 1 fi @@ -32,18 +56,6 @@ export NVIDIA_VISIBLE_DEVICES=all export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" -# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` -# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt -# generation). The single-threaded random prompt generator in the -# upstream sa-bench dominates bench startup on the 7p1d/conc=8192 -# sweep (~50 min for the main pass alone before the first HTTP -# request leaves the client). The fork bumps that to ~1 min via -# multiprocessing.Pool with `--random-num-workers 48`. 
-# -# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR -# (https://github.com/NVIDIA/srt-slurm/pull/114) merges. -SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" -SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5" # Squash files live alongside models on /mnt/vast (shared across nodes). # `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / @@ -102,15 +114,11 @@ fi git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" -git checkout "$SRT_SLURM_RECIPES_COMMIT" +git checkout "$SRT_SLURM_RECIPES_REF" -# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm -# checkout. Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch: -# destination must be `recipes/sglang/deepseek-v4` because -# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...` -# in `.github/configs/nvidia-master.yaml` is what srtctl loads. -mkdir -p recipes/sglang/deepseek-v4 -cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 +# Overlay the hand-rolled DSV4 recipes onto the selected srt-slurm checkout. +mkdir -p "$SRT_RECIPE_DST" +cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST" echo "Installing srtctl..." # CRITICAL — uv install location. @@ -166,7 +174,7 @@ mkdir -p configs/dynamo-wheels echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < Date: Thu, 30 Apr 2026 23:59:24 -0500 Subject: [PATCH 11/12] gb300-cw: import squash files via srun under flock Mirror the launch_gb300-nv.sh pattern: compute nodes are aarch64, the runner pod is x86_64, so dispatch `enroot import` to a compute node via srun rather than relying on a manually-staged sqsh. flock on the shared VAST lock file serializes parallel cw_0/1/2/3 jobs. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_gb300-cw.sh | 44 +++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 5e593ac00..413df8d8e 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -63,33 +63,29 @@ NGINX_IMAGE="nginx:1.27.4" # old /mnt/vast/squash dir contains '+'-separated files from prior runs. SQUASH_DIR="/mnt/vast/squash_dupe" mkdir -p "$SQUASH_DIR" -# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the -# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as -# arm64-only. The CI runner pod is x86_64 and (a) cannot run -# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs` -# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"), -# and (b) even with `--arch aarch64` the conversion still fails on x86. -# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780` -# the import has to be dispatched to an arm64 compute node via srun. -# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh -# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist -# script when the docker tag is updated). Filename suffix `_arm64` -# distinguishes the working arm64 sqsh from any stale amd64 shadow. SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" -if [[ ! 
-f "$SQUASH_FILE" ]]; then - echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2 - echo "Refresh it on a GB300 compute node via the script in the gist:" >&2 - echo " https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2 - exit 1 -fi -if [[ ! -f "$NGINX_SQUASH_FILE" ]]; then - echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2 - echo "Run on an aarch64 host:" >&2 - echo " enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2 - exit 1 -fi +# Run the import on a compute node via srun, not on the runner pod: +# the runner pod is x86_64 while the compute nodes are aarch64, so the +# arm64 squash file has to be built on a compute node. +import_squash() { + local squash="$1" image="$2" + local lock="${squash}.lock" + srun --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --exclusive --time=180 bash -c " + exec 9>\"$lock\" + flock -w 600 9 || { echo 'Failed to acquire lock for $squash' >&2; exit 1; } + if unsquashfs -l \"$squash\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import: $squash' + else + rm -f \"$squash\" + enroot import -o \"$squash\" docker://$image + fi + " +} + +import_squash "$SQUASH_FILE" "$IMAGE" +import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" From 42e339ee86860a0e37c0932766496d67df7f1994 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 1 May 2026 00:13:08 -0500 Subject: [PATCH 12/12] Pin dsv4-fp4-gb300-dynamo-vllm to gb300-nv runners The cw runners hit a srtctl version mismatch on the dynamo-vllm srt-slurm pin (aflowers/gb200-dsv4-recipes rejects the default_bash_preamble field, dropping the model_paths block). Route this config to the nv runners until the cw srtctl pin is bumped. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e5776cba7..8a80b3fb3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7834,7 +7834,7 @@ dsv4-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.0-ubuntu2404 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300 + runner: gb300-nv precision: fp4 framework: dynamo-vllm multinode: true