From dbb221882dd76eb8e7662a51cab87544ad3796b2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:12:11 -0700 Subject: [PATCH 01/28] Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo Adapts the kimi-k2.5 dynamo-vllm 8k1k 5p1d-dep4-dep8 recipe to DeepSeek V4 Flash. Recipes live under srt-slurm-recipes/ and are copied into the srt-slurm checkout at runtime since the upstream NVIDIA/srt-slurm repo doesn't ship DSV4 recipes. - New config key: dsv4-fp4-gb200-dynamo-vllm - Image: vllm/vllm-openai:deepseekv4-cu130 - Model: deepseek-ai/DeepSeek-V4-Flash - Model path on cluster: /mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash --- .github/configs/nvidia-master.yaml | 27 +++++ perf-changelog.yaml | 8 ++ runners/launch_gb200-nv.sh | 12 ++- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 101 ++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ec9cbc11e..76da9e7d7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7428,3 +7428,30 @@ kimik2.5-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb200-dynamo-vllm: + image: vllm/vllm-openai:deepseekv4-cu130 + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [2048] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddc6409c2..1445ad3c7 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,11 @@ +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 5p1d)" + - "Container: vllm/vllm-openai:deepseekv4-cu130" + - "Recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index b746e4a24..cb0396421 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -42,8 +42,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then if [[ $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. 
Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 fi else @@ -134,7 +137,12 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -if [[ $FRAMEWORK == "dynamo-vllm" ]]; then +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml new file mode 100644 index 000000000..6f5db6e49 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -0,0 +1,101 @@ +name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" + +model: + path: "deepseek-v4-flash" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + version: 1.0.1 + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + decode_environment: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 64 + enforce-eager: true + compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + all2all-backend: "flashinfer_nvlink_one_sided" + gpu-memory-utilization: 0.9 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 10240 + max-num-seqs: 512 + max-num-batched-tokens: 10240 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-chunked-prefill: true + async-scheduling: true + attention-backend: "FLASHINFER_MLA" + block-size: 64 + all2all-backend: "flashinfer_nvlink_one_sided" + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + max-cudagraph-capture-size: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" From 1bb849472caed1897592777ae7ff09dd5bd0fdb1 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:24:46 -0700 Subject: [PATCH 02/28] flags --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 6f5db6e49..21fe86970 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -55,18 +55,21 @@ backend: enable-expert-parallel: true max-model-len: 10240 max-num-seqs: 64 - enforce-eager: true - compilation-config: '{"custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' max-num-batched-tokens: 16384 safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true no-enable-chunked-prefill: true attention-backend: "FLASHINFER_MLA" - block-size: 64 - attention-config: '{"use_trtllm_ragged_deepseek_prefill": true}' + block-size: 256 + attention-config: '{"use_trtllm_ragged_deepseek_prefill": true, "use_fp4_indexer_cache": true}' all2all-backend: "flashinfer_nvlink_one_sided" gpu-memory-utilization: 0.9 + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -86,12 +89,17 @@ backend: no-enable-chunked-prefill: true async-scheduling: true attention-backend: "FLASHINFER_MLA" - block-size: 64 + block-size: 256 + attention-config: '{"use_fp4_indexer_cache": true}' all2all-backend: "flashinfer_nvlink_one_sided" - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8","+rms_norm","+rotary_embedding"],"pass_config":{"fuse_attn_quant":true,"fuse_allreduce_rms":true}}' + compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' gpu-memory-utilization: 0.9 stream-interval: 50 max-cudagraph-capture-size: 512 + tokenizer-mode: "deepseek_v4" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" benchmark: type: "sa-bench" From 41e71b833c712b4dcc392e3f096c054d77bf86a4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:27:58 -0700 Subject: [PATCH 03/28] import --- .../vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 21fe86970..a98e63480 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -6,7 +6,10 @@ model: precision: "fp4" dynamo: - version: 1.0.1 + # Source install pinned to the first commit that fixes the vllm.inputs 
restructure. + # v1.0.1 / v1.0.2 / v1.1.0-dev.* on PyPI still import from vllm.inputs.data, which + # was removed in the vLLM build inside vllm/vllm-openai:deepseekv4-cu130. + hash: d5803cbe71c0035a725652373a175f01942c4a33 install: true setup_script: vllm-container-deps.sh From 4854a7a0db0f10c2dfbd906dd382ab48d03f4a04 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 21:42:03 -0700 Subject: [PATCH 04/28] flags --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index a98e63480..18450501a 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -27,6 +27,16 @@ resources: frontend: type: dynamo enable_multiple_frontends: false + # --dyn-chat-processor vllm routes OpenAI pre/post-processing through vLLM's + # FrontendArgs, which is what recognises --tool-call-parser, + # --enable-auto-tool-choice, and --reasoning-parser. In a Dynamo disagg split + # these are server-layer concerns: the workers (dynamo.vllm) only accept + # AsyncEngineArgs. + args: + dyn-chat-processor: "vllm" + tool-call-parser: "deepseek_v4" + enable-auto-tool-choice: true + reasoning-parser: "deepseek_v4" backend: type: vllm @@ -70,9 +80,6 @@ backend: all2all-backend: "flashinfer_nvlink_one_sided" gpu-memory-utilization: 0.9 tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -100,9 +107,6 @@ backend: stream-interval: 50 max-cudagraph-capture-size: 512 tokenizer-mode: "deepseek_v4" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" benchmark: type: "sa-bench" From ac030e6d315a073db0f369f28633875b6211123c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 22:02:12 -0700 Subject: [PATCH 05/28] recipe change --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 62 ++++++++----------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 18450501a..9d727f400 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -1,14 +1,19 @@ name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" +# Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). +# Changes: +# * DeepSeek-V4-Flash instead of Pro (smaller model, same arch) +# * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 +# * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 +# on PyPI still imports from vllm.inputs.data, which the vLLM in +# vllm/vllm-openai:deepseekv4-cu130 no longer exposes) + model: path: "deepseek-v4-flash" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" dynamo: - # Source install pinned to the first commit that fixes the vllm.inputs restructure. - # v1.0.1 / v1.0.2 / v1.1.0-dev.* on PyPI still import from vllm.inputs.data, which - # was removed in the vLLM build inside vllm/vllm-openai:deepseekv4-cu130. 
hash: d5803cbe71c0035a725652373a175f01942c4a33 install: true @@ -27,34 +32,26 @@ resources: frontend: type: dynamo enable_multiple_frontends: false - # --dyn-chat-processor vllm routes OpenAI pre/post-processing through vLLM's - # FrontendArgs, which is what recognises --tool-call-parser, - # --enable-auto-tool-choice, and --reasoning-parser. In a Dynamo disagg split - # these are server-layer concerns: the workers (dynamo.vllm) only accept - # AsyncEngineArgs. - args: - dyn-chat-processor: "vllm" - tool-call-parser: "deepseek_v4" - enable-auto-tool-choice: true - reasoning-parser: "deepseek_v4" backend: type: vllm connector: null prefill_environment: - VLLM_USE_FLASHINFER_MOE_FP4: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" decode_environment: - VLLM_USE_FLASHINFER_MOE_FP4: "1" + TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" vllm_config: prefill: @@ -66,20 +63,16 @@ backend: data-parallel-size: 4 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 10240 - max-num-seqs: 64 - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 max-num-batched-tokens: 16384 - safetensors-load-strategy: "prefetch" trust-remote-code: true no-enable-prefix-caching: true - no-enable-chunked-prefill: true - attention-backend: "FLASHINFER_MLA" block-size: 256 - attention-config: '{"use_trtllm_ragged_deepseek_prefill": true, "use_fp4_indexer_cache": true}' - all2all-backend: "flashinfer_nvlink_one_sided" - gpu-memory-utilization: 0.9 - tokenizer-mode: "deepseek_v4" + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' @@ -90,23 +83,18 @@ backend: data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true - max-model-len: 10240 - max-num-seqs: 512 - max-num-batched-tokens: 10240 - safetensors-load-strategy: "prefetch" + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 trust-remote-code: true no-enable-prefix-caching: true - no-enable-chunked-prefill: true - async-scheduling: true - attention-backend: "FLASHINFER_MLA" block-size: 256 - attention-config: '{"use_fp4_indexer_cache": true}' - all2all-backend: "flashinfer_nvlink_one_sided" - compilation-config: '{"cudagraph_mode":"FULL_AND_PIECEWISE","custom_ops":["all"]}' + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' gpu-memory-utilization: 0.9 stream-interval: 50 - max-cudagraph-capture-size: 512 - tokenizer-mode: "deepseek_v4" + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true benchmark: type: "sa-bench" From b592c60b2969bfed3e4dcc0a4e2674362cdc81b6 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 22:26:30 -0700 Subject: [PATCH 06/28] prompt --- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 2 ++ .../vllm/deepseek-v4/dsv4-chat-template.jinja | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index 9d727f400..fa546761c 100644 
--- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -44,6 +44,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -52,6 +53,7 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" vllm_config: prefill: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja new file mode 100644 index 000000000..e684deee8 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja @@ -0,0 +1,32 @@ +{#- Minimal DeepSeek-V4 chat template. + +DSV4 does not ship a Jinja chat_template; HuggingFace provides only the +Python `encoding_dsv4` helper. Dynamo's frontend still requires a +chat_template at startup (PromptFormatter.from_mdc), so we register this +file via --custom-jinja-template. + +This template is a best-effort DeepSeek-style formatter: delimiters +mirror DeepSeek-V3 (<|User|>, <|Assistant|>, <|end_of_sentence|>) and it +renders `reasoning_content` wrapped in ... so Dynamo's +`template_handles_reasoning` detection fires (avoids double-injection). + +sa-bench throughput runs use /v1/completions (raw prompts), so this +template is not exercised during benchmarking. If eval or chat-style +workloads are added later, replace this with a validated template +derived from deepseek-ai/DeepSeek-V4-*/encoding_dsv4.py. -#} +{%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {{ message['content'] }} + {%- elif message['role'] == 'user' -%} + <|User|>{{ message['content'] }} + {%- elif message['role'] == 'assistant' -%} + <|Assistant|> + {%- if message.get('reasoning_content') -%} + {{ message['reasoning_content'] }} + {%- endif -%} + {{ message['content'] }}<|end_of_sentence|> + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} +<|Assistant|> +{%- endif -%} From 11a4c08f78e25716a7f5d12780e63fe52fda4deb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 23:00:36 -0700 Subject: [PATCH 07/28] prompt --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_gb200-nv.sh | 4 +- .../8k1k/disagg-gb200-5p1d-dep4-dep8.yaml | 16 +++-- .../vllm/deepseek-v4/dsv4-chat-template.jinja | 65 ++++++++++++------- 4 files changed, 58 insertions(+), 29 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 76da9e7d7..a6e0c3ce3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7431,7 +7431,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dsv4-fp4-gb200-dynamo-vllm: image: vllm/vllm-openai:deepseekv4-cu130 - model: deepseek-ai/DeepSeek-V4-Flash + model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 precision: fp4 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index cb0396421..d7d0271e7 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,8 +43,8 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - export 
MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Flash" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-flash" + export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml index fa546761c..afba21415 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml @@ -2,14 +2,15 @@ name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" # Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). # Changes: -# * DeepSeek-V4-Flash instead of Pro (smaller model, same arch) # * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 # * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 # on PyPI still imports from vllm.inputs.data, which the vLLM in # vllm/vllm-openai:deepseekv4-cu130 no longer exposes) +# * DYN_CUSTOM_JINJA_TEMPLATE points at a derived-from-encoding_dsv4 template +# since DSV4 ships no Jinja chat_template model: - path: "deepseek-v4-flash" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" @@ -58,7 +59,7 @@ backend: vllm_config: prefill: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 @@ -78,7 +79,7 @@ backend: decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Flash" + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 @@ -104,3 +105,10 @@ benchmark: osl: 1024 concurrencies: "2048" req_rate: "inf" + # DSV4's HF tokenizer ships no chat_template (README: "This release does not + # include a Jinja-format chat template"). sa-bench's --use-chat-template + # path calls tokenizer.apply_chat_template() directly on the HF tokenizer, + # which raises ValueError. Send raw random tokens via /v1/completions + # instead — correct for throughput benchmarking and matches sa-bench's + # warmup path that already succeeded in prior runs. + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja index e684deee8..b2e8a5f37 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja +++ b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja @@ -1,32 +1,53 @@ -{#- Minimal DeepSeek-V4 chat template. +{#- DeepSeek-V4 chat template. -DSV4 does not ship a Jinja chat_template; HuggingFace provides only the -Python `encoding_dsv4` helper. Dynamo's frontend still requires a -chat_template at startup (PromptFormatter.from_mdc), so we register this -file via --custom-jinja-template. 
+Derived from the reference encoding_dsv4 at +https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/tree/main/encoding +and README.md (README Quick Start): -This template is a best-effort DeepSeek-style formatter: delimiters -mirror DeepSeek-V3 (<|User|>, <|Assistant|>, <|end_of_sentence|>) and it -renders `reasoning_content` wrapped in ... so Dynamo's -`template_handles_reasoning` detection fires (avoids double-injection). + <|begin▁of▁sentence|>{system} + <|User|>{message}<|Assistant|>{reasoning}{response}<|end▁of▁sentence|> -sa-bench throughput runs use /v1/completions (raw prompts), so this -template is not exercised during benchmarking. If eval or chat-style -workloads are added later, replace this with a validated template -derived from deepseek-ai/DeepSeek-V4-*/encoding_dsv4.py. -#} -{%- for message in messages -%} - {%- if message['role'] == 'system' -%} - {{ message['content'] }} - {%- elif message['role'] == 'user' -%} - <|User|>{{ message['content'] }} +Format rules implemented: + * BOS <|begin▁of▁sentence|> once at the start, immediately followed by + the system prompt inline (no role wrapper). + * User turn: <|User|>{content} + * Assistant turn (thinking mode — DSV4 default): + <|Assistant|>{reasoning_content}{content}<|end▁of▁sentence|> + Assistant turn (chat mode, no reasoning): + <|Assistant|>{content}<|end▁of▁sentence|> + (chat mode opens an empty thinking block, per the README.) + * add_generation_prompt: <|Assistant|> (thinking mode default) + +Tool calls and the developer / latest_reminder / quick-instruction roles +from encoding_dsv4.py are NOT implemented here. sa-bench throughput runs +use /v1/completions so this template is only evaluated at frontend +startup (PromptFormatter.from_mdc); it is not invoked per-request. If +eval via /v1/chat/completions is added, expand this template to match +encoding_dsv4.py (DSML tool-call format, drop_thinking semantics, etc.). 
+-#} +{%- if messages and messages[0]['role'] == 'system' -%} + {%- set system_content = messages[0]['content'] -%} + {%- set loop_messages = messages[1:] -%} +{%- else -%} + {%- set system_content = '' -%} + {%- set loop_messages = messages -%} +{%- endif -%} +<|begin▁of▁sentence|>{{ system_content }} +{%- for message in loop_messages -%} + {%- if message['role'] == 'user' -%} +<|User|>{{ message['content'] }} {%- elif message['role'] == 'assistant' -%} - <|Assistant|> +<|Assistant|> {%- if message.get('reasoning_content') -%} - {{ message['reasoning_content'] }} +{{ message['reasoning_content'] }} + {%- else -%} + {%- endif -%} - {{ message['content'] }}<|end_of_sentence|> +{{ message['content'] }}<|end▁of▁sentence|> + {%- elif message['role'] == 'tool' -%} +<|User|>{{ message['content'] }} {%- endif -%} {%- endfor -%} {%- if add_generation_prompt -%} -<|Assistant|> +<|Assistant|> {%- endif -%} From 9359fe8dc54c66c5c6f35966080883a16db17938 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Thu, 23 Apr 2026 23:19:04 -0700 Subject: [PATCH 08/28] prompt --- .github/configs/nvidia-master.yaml | 14 ++--- perf-changelog.yaml | 6 +-- ...yaml => disagg-gb200-7p1d-dep8-dep16.yaml} | 48 ++++++++--------- .../vllm/deepseek-v4/dsv4-chat-template.jinja | 53 ------------------- 4 files changed, 32 insertions(+), 89 deletions(-) rename srt-slurm-recipes/vllm/deepseek-v4/8k1k/{disagg-gb200-5p1d-dep4-dep8.yaml => disagg-gb200-7p1d-dep8-dep16.yaml} (61%) delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a6e0c3ce3..25c312ddf 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7442,16 +7442,16 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [2048] + - conc-list: [4096] prefill: - num-worker: 5 - tp: 4 - ep: 4 + num-worker: 7 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1445ad3c7..d028d4457 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,10 +1,10 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: - - "Add DeepSeek V4 Flash FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 5p1d)" + - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 - config-keys: - dsr1-fp8-h100-dynamo-trt diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 61% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml rename to srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index afba21415..d97d1de9d 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-5p1d-dep4-dep8.yaml +++ 
b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -1,13 +1,16 @@ -name: "dsv4-vllm-disagg-gb200-5p1d-dep4-dep8" +name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" -# Adapted from NVIDIA/srt-slurm PR #67 (deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16). -# Changes: -# * 5p1d-dep4-dep8 topology instead of 7p1d-dep8-dep16 -# * dynamo source-install pinned to the vllm.inputs.data fix commit (v1.0.2 -# on PyPI still imports from vllm.inputs.data, which the vLLM in -# vllm/vllm-openai:deepseekv4-cu130 no longer exposes) -# * DYN_CUSTOM_JINJA_TEMPLATE points at a derived-from-encoding_dsv4 template -# since DSV4 ships no Jinja chat_template +# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra +# benchmark flag: use_chat_template=false. The HF tokenizer for +# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's +# --use-chat-template path calls tokenizer.apply_chat_template() and raises +# ValueError. Throughput benchmarking uses /v1/completions with random tokens +# anyway — no chat template needed. +# +# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a +# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ +# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and +# uses this native formatter — no custom Jinja template required. model: path: "deepseek-v4-pro" @@ -15,7 +18,7 @@ model: precision: "fp4" dynamo: - hash: d5803cbe71c0035a725652373a175f01942c4a33 + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b install: true setup_script: vllm-container-deps.sh @@ -23,12 +26,12 @@ setup_script: vllm-container-deps.sh resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 5 - decode_nodes: 2 - prefill_workers: 5 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 8 + gpus_per_prefill: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -45,7 +48,6 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" decode_environment: TILELANG_CLEANUP_TEMP_FILES: "1" @@ -54,7 +56,6 @@ backend: NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" - DYN_CUSTOM_JINJA_TEMPLATE: "/infmax-workspace/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja" vllm_config: prefill: @@ -63,7 +64,7 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 4 + data-parallel-size: 8 data-parallel-rpc-port: 13345 enable-expert-parallel: true enforce-eager: true @@ -72,6 +73,7 @@ backend: max-num-batched-tokens: 16384 trust-remote-code: true no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true block-size: 256 gpu-memory-utilization: 0.88 no-disable-hybrid-kv-cache-manager: true @@ -83,7 +85,7 @@ backend: kv-cache-dtype: "fp8" tensor-parallel-size: 1 pipeline-parallel-size: 1 - data-parallel-size: 8 + data-parallel-size: 16 data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: auto @@ -103,12 +105,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048" + concurrencies: "4096" req_rate: "inf" - # DSV4's HF tokenizer ships no chat_template (README: "This release does not - # include a Jinja-format chat template"). sa-bench's --use-chat-template - # path calls tokenizer.apply_chat_template() directly on the HF tokenizer, - # which raises ValueError. 
Send raw random tokens via /v1/completions - # instead — correct for throughput benchmarking and matches sa-bench's - # warmup path that already succeeded in prior runs. use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja b/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja deleted file mode 100644 index b2e8a5f37..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/dsv4-chat-template.jinja +++ /dev/null @@ -1,53 +0,0 @@ -{#- DeepSeek-V4 chat template. - -Derived from the reference encoding_dsv4 at -https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/tree/main/encoding -and README.md (README Quick Start): - - <|begin▁of▁sentence|>{system} - <|User|>{message}<|Assistant|>{reasoning}{response}<|end▁of▁sentence|> - -Format rules implemented: - * BOS <|begin▁of▁sentence|> once at the start, immediately followed by - the system prompt inline (no role wrapper). - * User turn: <|User|>{content} - * Assistant turn (thinking mode — DSV4 default): - <|Assistant|>{reasoning_content}{content}<|end▁of▁sentence|> - Assistant turn (chat mode, no reasoning): - <|Assistant|>{content}<|end▁of▁sentence|> - (chat mode opens an empty thinking block, per the README.) - * add_generation_prompt: <|Assistant|> (thinking mode default) - -Tool calls and the developer / latest_reminder / quick-instruction roles -from encoding_dsv4.py are NOT implemented here. sa-bench throughput runs -use /v1/completions so this template is only evaluated at frontend -startup (PromptFormatter.from_mdc); it is not invoked per-request. If -eval via /v1/chat/completions is added, expand this template to match -encoding_dsv4.py (DSML tool-call format, drop_thinking semantics, etc.). --#} -{%- if messages and messages[0]['role'] == 'system' -%} - {%- set system_content = messages[0]['content'] -%} - {%- set loop_messages = messages[1:] -%} -{%- else -%} - {%- set system_content = '' -%} - {%- set loop_messages = messages -%} -{%- endif -%} -<|begin▁of▁sentence|>{{ system_content }} -{%- for message in loop_messages -%} - {%- if message['role'] == 'user' -%} -<|User|>{{ message['content'] }} - {%- elif message['role'] == 'assistant' -%} -<|Assistant|> - {%- if message.get('reasoning_content') -%} -{{ message['reasoning_content'] }} - {%- else -%} - - {%- endif -%} -{{ message['content'] }}<|end▁of▁sentence|> - {%- elif message['role'] == 'tool' -%} -<|User|>{{ message['content'] }} - {%- endif -%} -{%- endfor -%} -{%- if add_generation_prompt -%} -<|Assistant|> -{%- endif -%} From 1d51ba1eb6372886de54b24ae2066a8216ab5a5d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 00:35:46 -0700 Subject: [PATCH 09/28] weight loading --- runners/launch_gb200-nv.sh | 38 +++++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 8 ++++ 2 files changed, 46 insertions(+) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index d7d0271e7..db8bf4d4b 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -57,6 +57,44 @@ fi export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" +# ---- DSV4 weight pre-stage to compute-node-local NVMe ---- +# DSV4-Pro (~850 GB FP4+FP8 weights) loads too slowly from Lustre: 14 prefill +# workers contending for the same OSTs stretches the load past srtctl's +# health-check deadline. Stage once onto /mnt/numa0 (14T local NVMe RAID per +# compute node) via srun across all 18 batch-partition nodes before launching +# srtctl. 
Subsequent runs hit the local copy and skip the rsync via the +# .stage-complete marker. +if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then + LUSTRE_SRC="$MODEL_PATH" + STAGED_MODEL_PATH="/mnt/numa0/cache/deepseek-v4-pro" + STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" + # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) + STAGE_NODES=18 + + echo "Pre-staging DSV4 weights $LUSTRE_SRC -> $STAGED_MODEL_PATH on $STAGE_NODES nodes..." + if srun --account="$SLURM_ACCOUNT" --partition="$SLURM_PARTITION" \ + --nodes="$STAGE_NODES" --ntasks-per-node=1 \ + --time=40:00 --job-name=dsv4-prestage --exclusive \ + bash -c ' + set -e + host=$(hostname) + if [ -f "'"$STAGE_MARKER"'" ]; then + echo "[$host] already staged, skipping" + exit 0 + fi + mkdir -p "'"$STAGED_MODEL_PATH"'" + echo "[$host] rsync start: $(date -u +%H:%M:%S)" + time rsync -a --whole-file --info=stats2 "'"$LUSTRE_SRC"'/" "'"$STAGED_MODEL_PATH"'/" + touch "'"$STAGE_MARKER"'" + echo "[$host] rsync done: $(date -u +%H:%M:%S)" + '; then + echo "Pre-stage complete; pointing MODEL_PATH at local copy" + export MODEL_PATH="$STAGED_MODEL_PATH" + else + echo "WARNING: pre-stage failed (srun exit $?); falling back to Lustre MODEL_PATH=$LUSTRE_SRC" + fi +fi + NGINX_IMAGE="nginx:1.27.4" SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index d97d1de9d..1e96ed90d 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -23,6 +23,14 @@ dynamo: setup_script: vllm-container-deps.sh +# Bump health-check from the 1800s default to 2 hours. DSV4-Pro (~850 GB +# FP4+FP8 weights) loads off Lustre slowly on a cold cache — observed +# ~33 min for 64 safetensor shards with 14 prefill workers contending for +# the same OSTs. 1800s isn't enough; 7200s gives headroom. +health_check: + max_attempts: 720 + interval_seconds: 10 + resources: gpu_type: "gb200" gpus_per_node: 4 From 4ce52cd06ab388274bcfe5b8387a2ffe57de5c3c Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 02:11:12 -0700 Subject: [PATCH 10/28] sweep --- .github/configs/nvidia-master.yaml | 33 +++++- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 109 ++++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 109 ++++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 4 files changed, 251 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 25c312ddf..4e8def37f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7442,7 +7442,38 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [4096] + # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # 10 nodes total. Low TTFT/TPOT focus. 
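+        # (Node math behind "10 nodes total", from the recipe resources:
+        # 1 prefill x 8 GPUs + 4 decodes x 8 GPUs = 40 GPUs, at 4 GPUs per
+        # GB200 node = 10 nodes.)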
+ - conc-list: [4, 8, 16, 32, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: true + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] prefill: num-worker: 7 tp: 8 diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..98f613adf --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,109 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# Interactivity-focused topology: 1 prefill worker + 4 separate decode +# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more +# than aggregate throughput. Same per-worker vllm_config as the NVIDIA +# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs +# / cudagraph capture / batched-tokens), and benchmark concurrencies +# differ. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 32 + max-cudagraph-capture-size: 32 + max-num-batched-tokens: 32 + trust-remote-code: true + no-enable-prefix-caching: true + 
block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..4c59e5a73 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,109 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git 
a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 1e96ed90d..318362ef1 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -113,6 +113,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "2048x4096" req_rate: "inf" use_chat_template: false From 071643b75d115771cb657429f55c943dd68e1961 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 09:31:58 -0700 Subject: [PATCH 11/28] Add 1k/1k DSV4-Pro recipes, comment out 8k/1k for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two 1k/1k vLLM disagg recipes extrapolated from kimi-k2.5/1k1k (scaled to DSV4-Pro's DP>=8-per-worker constraint): * disagg-gb200-1p4d-dep8-dep8.yaml — interactivity (conc 4-128), 10 nodes * disagg-gb200-1p1d-dep8-dep16.yaml — mid/high throughput (conc 256-4096), 6 nodes Per-recipe tuning vs our 8k/1k baseline: * max-model-len 3072 (matches kimi 1k/1k) * prefill max-num-seqs 16 (fills 16384-token budget at 1k per seq) * decode max-num-seqs 128/512 (shorter KV -> more parallelism) nvidia-master.yaml changes: * Adds the 1k/1k seq-len-config with conc-lists stripped of 4/16/32 * Comments out the entire 8k/1k block so sweep-enabled runs don't re-trigger 8k/1k while 1k/1k numbers are collected. Re-enable by uncommenting (instructions at the top of the block). --- .github/configs/nvidia-master.yaml | 91 ++++++++++---- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 115 ++++++++++++++++++ .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 114 +++++++++++++++++ 3 files changed, 297 insertions(+), 23 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4e8def37f..282dcf85d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7439,50 +7439,95 @@ dsv4-fp4-gb200-dynamo-vllm: multinode: true disagg: true seq-len-configs: - - isl: 8192 + # 1k/1k — extrapolated from kimi-k2.5 1k/1k topologies, scaled to DSV4-Pro's + # DP>=8 constraint. No upstream NVIDIA reference for DSV4-Pro vLLM disagg + # at this seq-len yet (PR #67 only publishes 8k/1k). + - isl: 1024 osl: 1024 search-space: - # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # 10 nodes total. Low TTFT/TPOT focus. - - conc-list: [4, 8, 16, 32, 128] + # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. + # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten + # sweep runtime. Re-add them together with the 8k/1k block below. + - conc-list: [8, 64, 128] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: true - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] + # Mid-to-high throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). + # 6 nodes. Single prefill is plenty for 1k prompts. 
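+        # (Why one prefill suffices at 1k ISL, assuming vLLM's 16384
+        # max-num-batched-tokens budget applies per DP rank: each rank packs
+        # 16 x 1024-token prompts per step, so 8 ranks keep ~128 prefills in
+        # flight, vs the 2 seqs per rank the 8k recipes get from the same
+        # budget.)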
+ - conc-list: [256, 512, 1024, 2048, 3072, 4096] prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] - prefill: - num-worker: 7 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true + # --------------------------------------------------------------------- + # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the + # sweep-enabled gate while we collect 1k/1k data. Re-enable by + # uncommenting (remove the leading "# " on every line of the block + # below). The conc-lists already have 4/16/32 stripped — add them back + # together with the 1k/1k 1p4d block if you want the full sweep again. + # --------------------------------------------------------------------- + # - isl: 8192 + # osl: 1024 + # search-space: + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # 10 nodes total. Low TTFT/TPOT focus. + # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + # - conc-list: [2048, 4096] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..779bc8bae --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -0,0 +1,115 @@ +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" + +# 1k/1k mid-to-high throughput topology. Extrapolated from +# kimi-k2.5/1k1k/disagg-gb200-1p1d-dep4-dep16.yaml adjusted for DSV4-Pro's +# DP>=8 minimum. Single prefill worker feeding a wide DP=16 decode handles +# conc 256-4096 cleanly for 1k prompts (prefill throughput per rank is high +# enough at this prompt length; see kimi precedent). 
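+#
+# Capacity sketch from the knobs below (arithmetic only, nothing measured):
+# decode runs DP=16 ranks x max-num-seqs 512 = 8192 resident seqs, double
+# the top benchmark concurrency of 4096, while prefill packs max-num-seqs
+# 16 x 1024 prompt tokens = 16384 = max-num-batched-tokens exactly.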
+# +# Differences from our 8k1k 7p1d-dep8-dep16: +# * prefill_workers: 1 (vs 7) — 1k prompts don't need 14 prefill nodes +# * max-model-len: 3072 instead of auto +# * prefill max-num-seqs: 16 (fills 16384-token budget at 1k per seq) +# * decode max-num-seqs: 512 instead of 256 (shorter KV, more parallelism) +# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x512x1024x2048x3072x4096" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..c6c6ee1dc --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,114 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# 1k/1k interactivity variant of the 8k/1k recipe with the same name (under +# ../8k1k/). Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml +# adjusted for DSV4-Pro's DP>=8 minimum (kimi uses TP=4, we use DP=8 per +# worker since model layers don't fit at smaller GPU counts). 
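+#
+# Headroom check (arithmetic from the knobs below, nothing measured): the
+# 4 decode workers x DP=8 give 32 ranks, so the top concurrency of 128 puts
+# only ~4 seqs on each rank; the per-rank max-num-seqs of 128 is mostly
+# slack, not a throughput knob.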
+# +# Differences from our 8k1k 1p4d-dep8-dep8: +# * max-model-len: 3072 (1024 + 1024 + 1024 headroom) instead of auto/10240 +# * prefill max-num-seqs: 16 instead of 2 (1k prompts fit 16/batch within +# the same 16384 max-num-batched-tokens budget) +# * decode max-num-seqs: 128 instead of 32 (shorter KV = more headroom) +# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +health_check: + max_attempts: 720 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64x128" + req_rate: "inf" + use_chat_template: false From 52b6a2e546012ef14f9472d1c3deec1c6988d5f8 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 09:47:25 -0700 Subject: [PATCH 12/28] Bump health-check and add slurm.time_limit to all DSV4 recipes Previous run reported "Model did not get healthy in 1800 seconds" on the 1k/1k 1p4d-dep8-dep8 recipe despite health_check.max_attempts being set to 720. 1800s is the srtctl default, so our override either wasn't applied or wasn't enough in the face of a cold-cache Lustre load. 
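(For scale: 720 attempts x 10s intervals works out to a nominal 7200s,
four times the 1800s the error message quotes, which is why "wasn't
applied" is the likelier of the two explanations.)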
Double down on the budget:

* health_check.max_attempts: 720 -> 1440 (7200s -> 14400s = 4 hours of
  configured budget; the failed run was effectively capped at the 1800s
  default)
* slurm.time_limit: 8:00:00 explicit (srtslurm.yaml default is 6h; make
  it even wider so the SLURM wall clock can't cut off a slow load)

Applied to all five recipes (1k/1k x2 and 8k/1k x3) so the fix carries
over when the 8k/1k block in nvidia-master.yaml is re-enabled.
---
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 12 +++++++++++-
 .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml  | 12 +++++++++++-
 .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml  |  5 ++++-
 .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml |  5 ++++-
 .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 14 +++++++++-----
 5 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
index 779bc8bae..256db4028 100644
--- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
+++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml
@@ -24,8 +24,18 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
+# a slow first-time Lustre load + cudagraph capture can't get cut off by the
+# SLURM wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
+# Lustre with multiple workers contending for the same OSTs — previous 1k/1k
+# run hit the default 1800s. Make this *very* generous since the cost of an
+# over-long deadline is just sitting idle, not wasted compute.
 health_check:
-  max_attempts: 720
+  max_attempts: 1440
   interval_seconds: 10
 
 resources:
diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
index c6c6ee1dc..576b7c8c0 100644
--- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
+++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml
@@ -23,8 +23,18 @@ dynamo:
 
 setup_script: vllm-container-deps.sh
 
+# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so
+# a slow first-time Lustre load + cudagraph capture can't get cut off by the
+# SLURM wall clock.
+slurm:
+  time_limit: "8:00:00"
+
+# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from
+# Lustre with multiple workers contending for the same OSTs — previous 1k/1k
+# run hit the default 1800s. Make this *very* generous since the cost of an
+# over-long deadline is just sitting idle, not wasted compute.
health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 98f613adf..7fa5e47d2 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -18,8 +18,11 @@ dynamo: setup_script: vllm-container-deps.sh +slurm: + time_limit: "8:00:00" + health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 4c59e5a73..d6b750bf2 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -18,8 +18,11 @@ dynamo: setup_script: vllm-container-deps.sh +slurm: + time_limit: "8:00:00" + health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 318362ef1..695db772a 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -23,12 +23,16 @@ dynamo: setup_script: vllm-container-deps.sh -# Bump health-check from the 1800s default to 2 hours. DSV4-Pro (~850 GB -# FP4+FP8 weights) loads off Lustre slowly on a cold cache — observed -# ~33 min for 64 safetensor shards with 14 prefill workers contending for -# the same OSTs. 1800s isn't enough; 7200s gives headroom. +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads +# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor +# shards with 14 prefill workers contending for the same OSTs. The first +# bump to 7200s was still insufficient in one case, so pad generously to +# 14400s (4h). Over-long deadline only costs idle time, not compute. health_check: - max_attempts: 720 + max_attempts: 1440 interval_seconds: 10 resources: From 768cddcc8343d9759b0cbf5d5bea70a9324aaeeb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 12:58:52 -0700 Subject: [PATCH 13/28] Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1k DSV4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces our hand-rolled 8k/1k DSV4-Pro vLLM disagg recipes with the four topologies from NVIDIA/srt-slurm PR #71 (source fork: alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68, pinned at commit d60e3f1c). PR #71 supersedes PR #67 that our original 8k/1k recipes were based on, with more topologies, a wider concurrency sweep per recipe, new env vars, explicit tokenizer-mode, and CPU/DRAM expert offload. We take everything except offload: * launch_gb200-nv.sh clones alec-flowers/srt-slurm for dsv4 instead of NVIDIA/srt-slurm. * Runtime post-clone patch strips `offload-group-size`, `offload-num-in-group`, `offload-prefetch-step`, and the commented `# offload-params` line from all four 8k/1k recipes. 
* Same post-clone patch injects our `slurm.time_limit: 8:00:00` and `health_check: {max_attempts: 1440, interval_seconds: 10}` (4 h budget) so the recipes match our cold-cache Lustre load budget. * Model-path alias changed from `deepseek-v4-pro` to `deepseekv4-fp4` to match PR #71 recipes' `model.path` field; 1k/1k local recipes updated to the same alias. * nvidia-master.yaml 8k/1k block rewritten: 4 search-space entries (1p1d-dep8-dep8, 3p1d-dep8-dep8, 3p1d-dep8-dep16, 6p1d-dep8-dep16), each running conc list [4, 8, 16, 32, 64, 256, 512, 1024] — 32 total 8k/1k benchmark points across 4 cluster startups. * Obsolete local 8k/1k recipes under srt-slurm-recipes/vllm/deepseek-v4/8k1k/ removed (superseded by the PR #71 upstream files). 1k/1k sweep is unchanged otherwise (2 matrix entries, 9 benchmark points using the hand-rolled recipes — no PR #71 equivalent at 1k/1k). --- .github/configs/nvidia-master.yaml | 119 +++++++++-------- runners/launch_gb200-nv.sh | 47 ++++++- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ---------------- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 112 ---------------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 122 ------------------ 7 files changed, 109 insertions(+), 407 deletions(-) delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml delete mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8f294462e..3841ed833 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7497,58 +7497,67 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # --------------------------------------------------------------------- - # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the - # sweep-enabled gate while we collect 1k/1k data. Re-enable by - # uncommenting (remove the leading "# " on every line of the block - # below). The conc-lists already have 4/16/32 stripped — add them back - # together with the 1k/1k 1p4d block if you want the full sweep again. - # --------------------------------------------------------------------- - # - isl: 8192 - # osl: 1024 - # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # # 10 nodes total. Low TTFT/TPOT focus. - # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. - # - conc-list: [8, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: true - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. 
- # - conc-list: [2048, 4096] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # 8k/1k — four topologies from NVIDIA/srt-slurm PR #71 (the alec-flowers + # fork is cloned instead of NVIDIA/srt-slurm and patched at runtime to + # strip CPU/DRAM expert offload). Each recipe runs the full conc list + # [4, 8, 16, 32, 64, 256, 512, 1024] (8 points) giving cross-topology + # coverage. Total 8k/1k points: 32. + - isl: 8192 + osl: 1024 + search-space: + # 1p1d-dep8-dep8 — 1 prefill + 1 decode, each DP=8. 4 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d-dep8-dep8 — 3 prefill + 1 decode, each DP=8. 8 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 3p1d-dep8-dep16 — 3 prefill (DP=8) + 1 wide decode (DP=16). 10 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 6p1d-dep8-dep16 — 6 prefill (DP=8) + 1 wide decode (DP=16). 16 nodes. + - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] + prefill: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index db8bf4d4b..da321bdb4 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,8 +43,10 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Model path alias matches NVIDIA srt-slurm PR #71 recipes + # (`model.path: "deepseekv4-fp4"`). export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" - export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 @@ -66,7 +68,7 @@ export SLURM_ACCOUNT="benchmark" # .stage-complete marker. 
if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then LUSTRE_SRC="$MODEL_PATH" - STAGED_MODEL_PATH="/mnt/numa0/cache/deepseek-v4-pro" + STAGED_MODEL_PATH="/mnt/numa0/cache/deepseekv4-fp4" STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) STAGE_NODES=18 @@ -176,10 +178,47 @@ if [ -d "$SRT_REPO_DIR" ]; then fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + # alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68 + # (https://github.com/NVIDIA/srt-slurm/pull/71) — supersedes PR #67 with + # 4 GB200 DSV4-Pro vLLM disagg recipes (1p1d, 3p1d-dep8, 3p1d-dep16, + # 6p1d-dep16), NUMA binding, new env vars, and explicit tokenizer-mode. + # Pinned to PR #71 head for reproducibility. + git clone https://github.com/alec-flowers/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout sa-submission-q2-2026 + git checkout d60e3f1c7921721e52af01afaab59a70a1631106 + # Copy our hand-rolled 1k/1k recipes (no upstream equivalent for vLLM + # disagg at 1k/1k yet). 8k/1k recipes come from the upstream clone. cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + # PR #71's 8k/1k recipes include CPU/DRAM expert offload (offload-* + # knobs + a companion vllm_numa_bind_hash_fix.py patch). Strip the + # offload lines and inject our health_check + slurm.time_limit + # overrides so the recipes run without offload and with a generous + # cold-cache Lustre load budget. + python3 - <<'PY' +from pathlib import Path +for p in Path("recipes/vllm/deepseek-v4-pro/8k1k").glob("disagg-gb200-*.yaml"): + text = p.read_text() + # Drop offload-* knobs and the commented `# offload-params:` line. + kept = [] + for line in text.splitlines(): + stripped = line.lstrip() + if stripped.startswith("offload-") or stripped.startswith("# offload-params:"): + continue + kept.append(line) + text = "\n".join(kept) + ("\n" if text.endswith("\n") else "") + # Inject slurm.time_limit and health_check overrides after setup_script. 
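+    # The `"health_check:" not in text` guard below makes this idempotent:
+    # a recipe that already carries the override is left untouched on a
+    # re-run, and count=1 pins the injection to the first marker occurrence.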
+ marker = "setup_script: vllm-container-deps.sh\n" + if marker in text and "health_check:" not in text: + text = text.replace( + marker, + marker + + "\nslurm:\n time_limit: \"8:00:00\"\n" + + "\nhealth_check:\n max_attempts: 1440\n interval_seconds: 10\n", + 1, + ) + p.write_text(text) + print(f"patched {p}") +PY elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 256db4028..4204c26b5 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -14,7 +14,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 model: - path: "deepseek-v4-pro" + path: "deepseekv4-fp4" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 576b7c8c0..9981de640 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -13,7 +13,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 model: - path: "deepseek-v4-pro" + path: "deepseekv4-fp4" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml deleted file mode 100644 index 7fa5e47d2..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" - -# Interactivity-focused topology: 1 prefill worker + 4 separate decode -# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more -# than aggregate throughput. Same per-worker vllm_config as the NVIDIA -# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs -# / cudagraph capture / batched-tokens), and benchmark concurrencies -# differ. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 32 - max-cudagraph-capture-size: 32 - max-num-batched-tokens: 32 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x16x32x128" - req_rate: "inf" - use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index d6b750bf2..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" - -# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single -# wide decode (DP=16). Targets conc 512-1024 where a single big decode -# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d -# reference (PR #67); only resources, prefill_workers count, and -# benchmark concurrencies differ. Decode capacity matches 7p1d -# (max-num-seqs=256) since the decode topology itself is identical. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" - use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml deleted file mode 100644 index 695db772a..000000000 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ /dev/null @@ -1,122 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" - -# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra -# benchmark flag: use_chat_template=false. The HF tokenizer for -# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's -# --use-chat-template path calls tokenizer.apply_chat_template() and raises -# ValueError. Throughput benchmarking uses /v1/completions with random tokens -# anyway — no chat template needed. -# -# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a -# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ -# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and -# uses this native formatter — no custom Jinja template required. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads -# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor -# shards with 14 prefill workers contending for the same OSTs. The first -# bump to 7200s was still insufficient in one case, so pad generously to -# 14400s (4h). Over-long deadline only costs idle time, not compute. -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 16 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 256 - max-cudagraph-capture-size: 256 - max-num-batched-tokens: 256 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "2048x4096" - req_rate: "inf" - use_chat_template: false From af10ca0c63ffeee76276db421ed8185823e9737e Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 13:16:39 -0700 Subject: [PATCH 14/28] path --- runners/launch_gb200-nv.sh | 43 +++----------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index da321bdb4..40a884086 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -44,8 +44,9 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then # Model path alias matches NVIDIA srt-slurm PR #71 recipes - # (`model.path: "deepseekv4-fp4"`). 
- export MODEL_PATH="/mnt/lustre01/users/sa-shared/DeepSeek-V4-Pro" + # (`model.path: "deepseekv4-fp4"`). Weights live on compute-node + # local NVMe (/mnt/numa1) for fast startup — no Lustre contention. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" @@ -59,44 +60,6 @@ fi export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" -# ---- DSV4 weight pre-stage to compute-node-local NVMe ---- -# DSV4-Pro (~850 GB FP4+FP8 weights) loads too slowly from Lustre: 14 prefill -# workers contending for the same OSTs stretches the load past srtctl's -# health-check deadline. Stage once onto /mnt/numa0 (14T local NVMe RAID per -# compute node) via srun across all 18 batch-partition nodes before launching -# srtctl. Subsequent runs hit the local copy and skip the rsync via the -# .stage-complete marker. -if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - LUSTRE_SRC="$MODEL_PATH" - STAGED_MODEL_PATH="/mnt/numa0/cache/deepseekv4-fp4" - STAGE_MARKER="$STAGED_MODEL_PATH/.stage-complete" - # Total node count == prefill_nodes + decode_nodes from the recipe (7p1d-dep8-dep16 = 14+4) - STAGE_NODES=18 - - echo "Pre-staging DSV4 weights $LUSTRE_SRC -> $STAGED_MODEL_PATH on $STAGE_NODES nodes..." - if srun --account="$SLURM_ACCOUNT" --partition="$SLURM_PARTITION" \ - --nodes="$STAGE_NODES" --ntasks-per-node=1 \ - --time=40:00 --job-name=dsv4-prestage --exclusive \ - bash -c ' - set -e - host=$(hostname) - if [ -f "'"$STAGE_MARKER"'" ]; then - echo "[$host] already staged, skipping" - exit 0 - fi - mkdir -p "'"$STAGED_MODEL_PATH"'" - echo "[$host] rsync start: $(date -u +%H:%M:%S)" - time rsync -a --whole-file --info=stats2 "'"$LUSTRE_SRC"'/" "'"$STAGED_MODEL_PATH"'/" - touch "'"$STAGE_MARKER"'" - echo "[$host] rsync done: $(date -u +%H:%M:%S)" - '; then - echo "Pre-stage complete; pointing MODEL_PATH at local copy" - export MODEL_PATH="$STAGED_MODEL_PATH" - else - echo "WARNING: pre-stage failed (srun exit $?); falling back to Lustre MODEL_PATH=$LUSTRE_SRC" - fi -fi - NGINX_IMAGE="nginx:1.27.4" SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" From f5245845e92ae8527774b2f5e47098ad916438c7 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:06:01 -0700 Subject: [PATCH 15/28] Revert "Adopt NVIDIA srt-slurm PR #71 recipes (sans offload) for 8k/1k DSV4" This reverts commit 768cddcc8343d9759b0cbf5d5bea70a9324aaeeb. 
--- .github/configs/nvidia-master.yaml | 119 ++++++++--------- runners/launch_gb200-nv.sh | 49 +------ .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 112 ++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 122 ++++++++++++++++++ 7 files changed, 409 insertions(+), 109 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3841ed833..8f294462e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7497,67 +7497,58 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # 8k/1k — four topologies from NVIDIA/srt-slurm PR #71 (the alec-flowers - # fork is cloned instead of NVIDIA/srt-slurm and patched at runtime to - # strip CPU/DRAM expert offload). Each recipe runs the full conc list - # [4, 8, 16, 32, 64, 256, 512, 1024] (8 points) giving cross-topology - # coverage. Total 8k/1k points: 32. - - isl: 8192 - osl: 1024 - search-space: - # 1p1d-dep8-dep8 — 1 prefill + 1 decode, each DP=8. 4 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # 3p1d-dep8-dep8 — 3 prefill + 1 decode, each DP=8. 8 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-32-c2048-offload.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # 3p1d-dep8-dep16 — 3 prefill (DP=8) + 1 wide decode (DP=16). 10 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c4096-offload.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # 6p1d-dep8-dep16 — 6 prefill (DP=8) + 1 wide decode (DP=16). 16 nodes. - - conc-list: [4, 8, 16, 32, 64, 256, 512, 1024] - prefill: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-6p1d-dep8-dep16-64-c8192-offload.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + # --------------------------------------------------------------------- + # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the + # sweep-enabled gate while we collect 1k/1k data. Re-enable by + # uncommenting (remove the leading "# " on every line of the block + # below). The conc-lists already have 4/16/32 stripped — add them back + # together with the 1k/1k 1p4d block if you want the full sweep again. + # --------------------------------------------------------------------- + # - isl: 8192 + # osl: 1024 + # search-space: + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # 10 nodes total. Low TTFT/TPOT focus. 
+ # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: true + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + # - conc-list: [2048, 4096] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 40a884086..6c8e706f1 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -43,11 +43,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/mnt/lustre01/models/kimi-k2.5-nvfp4" export SRT_SLURM_MODEL_PREFIX="kimi-k2.5-nvfp4" elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then - # Model path alias matches NVIDIA srt-slurm PR #71 recipes - # (`model.path: "deepseekv4-fp4"`). Weights live on compute-node - # local NVMe (/mnt/numa1) for fast startup — no Lustre contention. + # Weights live on compute-node local NVMe (/mnt/numa1) — no Lustre + # contention, fast startup. SRT_SLURM_MODEL_PREFIX matches the + # model.path alias in our DSV4 recipes. export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" - export SRT_SLURM_MODEL_PREFIX="deepseekv4-fp4" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" exit 1 @@ -141,47 +141,10 @@ if [ -d "$SRT_REPO_DIR" ]; then fi if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then - # alec-flowers/srt-slurm, branch aflowers/dsv4-pr67-pr68 - # (https://github.com/NVIDIA/srt-slurm/pull/71) — supersedes PR #67 with - # 4 GB200 DSV4-Pro vLLM disagg recipes (1p1d, 3p1d-dep8, 3p1d-dep16, - # 6p1d-dep16), NUMA binding, new env vars, and explicit tokenizer-mode. - # Pinned to PR #71 head for reproducibility. - git clone https://github.com/alec-flowers/srt-slurm.git "$SRT_REPO_DIR" + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" - git checkout d60e3f1c7921721e52af01afaab59a70a1631106 - # Copy our hand-rolled 1k/1k recipes (no upstream equivalent for vLLM - # disagg at 1k/1k yet). 8k/1k recipes come from the upstream clone. + git checkout sa-submission-q2-2026 cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 - # PR #71's 8k/1k recipes include CPU/DRAM expert offload (offload-* - # knobs + a companion vllm_numa_bind_hash_fix.py patch). Strip the - # offload lines and inject our health_check + slurm.time_limit - # overrides so the recipes run without offload and with a generous - # cold-cache Lustre load budget. 
- python3 - <<'PY' -from pathlib import Path -for p in Path("recipes/vllm/deepseek-v4-pro/8k1k").glob("disagg-gb200-*.yaml"): - text = p.read_text() - # Drop offload-* knobs and the commented `# offload-params:` line. - kept = [] - for line in text.splitlines(): - stripped = line.lstrip() - if stripped.startswith("offload-") or stripped.startswith("# offload-params:"): - continue - kept.append(line) - text = "\n".join(kept) + ("\n" if text.endswith("\n") else "") - # Inject slurm.time_limit and health_check overrides after setup_script. - marker = "setup_script: vllm-container-deps.sh\n" - if marker in text and "health_check:" not in text: - text = text.replace( - marker, - marker - + "\nslurm:\n time_limit: \"8:00:00\"\n" - + "\nhealth_check:\n max_attempts: 1440\n interval_seconds: 10\n", - 1, - ) - p.write_text(text) - print(f"patched {p}") -PY elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 4204c26b5..256db4028 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -14,7 +14,7 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-dep16" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 512 model: - path: "deepseekv4-fp4" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 9981de640..576b7c8c0 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -13,7 +13,7 @@ name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" # * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 model: - path: "deepseekv4-fp4" + path: "deepseek-v4-pro" container: "vllm/vllm-openai:deepseekv4-cu130" precision: "fp4" diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml new file mode 100644 index 000000000..7fa5e47d2 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -0,0 +1,112 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" + +# Interactivity-focused topology: 1 prefill worker + 4 separate decode +# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more +# than aggregate throughput. Same per-worker vllm_config as the NVIDIA +# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs +# / cudagraph capture / batched-tokens), and benchmark concurrencies +# differ. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 32 + max-cudagraph-capture-size: 32 + max-num-batched-tokens: 32 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16x32x128" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..d6b750bf2 --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,112 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# Mid-concurrency topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). Targets conc 512-1024 where a single big decode +# batches efficiently. Same per-worker vllm_config as the NVIDIA 7p1d +# reference (PR #67); only resources, prefill_workers count, and +# benchmark concurrencies differ. Decode capacity matches 7p1d +# (max-num-seqs=256) since the decode topology itself is identical. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..695db772a --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -0,0 +1,122 @@ +name: "dsv4-vllm-disagg-gb200-7p1d-dep8-dep16" + +# Mirrors NVIDIA/srt-slurm PR #67 except for our local name and one extra +# benchmark flag: use_chat_template=false. The HF tokenizer for +# deepseek-ai/DeepSeek-V4-Pro ships no chat_template, so sa-bench's +# --use-chat-template path calls tokenizer.apply_chat_template() and raises +# ValueError. Throughput benchmarking uses /v1/completions with random tokens +# anyway — no chat template needed. +# +# The dynamo hash (6a159fed, 2026-04-23) pins to the commit that adds a +# native Rust DeepSeekV4Formatter in lib/llm/src/preprocessor/prompt/ +# deepseek_v4.rs. Dynamo's frontend auto-detects DSV4 by model name and +# uses this native formatter — no custom Jinja template required. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +# Bumped from the 1800s default. DSV4-Pro (~850 GB FP4+FP8 weights) loads +# off Lustre slowly on a cold cache — observed ~33 min for 64 safetensor +# shards with 14 prefill workers contending for the same OSTs. The first +# bump to 7200s was still insufficient in one case, so pad generously to +# 14400s (4h). Over-long deadline only costs idle time, not compute. +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" + use_chat_template: false From 18100e54697ad26ecb18ceb08c58dc0568afbdb4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:47:18 -0700 Subject: [PATCH 16/28] Add 1k/1k 3p1d-dep8-dep16 recipe for high concurrency (4096, 8192) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing 1k/1k 1p1d-dep8-dep16 recipe runs out of prefill at conc>=8192 — single DP=8 prefill worker can sustain ~80-150K tok/s, not the ~200-300K tok/s of demand at conc=8192. New 3p1d-dep8-dep16 recipe adds 2 more prefill workers (10 nodes total). Decode capacity bumped to max-num-seqs=1024 (vs 512 in 1p1d) so conc=8192 has headroom (per-rank 8192/16 = 512, well below 1024). 
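Back-of-envelope for the demand figure (a sketch, not a measurement; the
~35 s end-to-end request latency is an assumed round number):

    conc = 8192        # concurrent requests
    isl = 1024         # input tokens per request
    latency_s = 35.0   # assumed time each request occupies its slot
    demand = conc * isl / latency_s
    # ~240K prefill tok/s, i.e. ~2-3 DP=8 workers at 80-150K tok/s each
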
max-cudagraph-capture-size kept at 512 — steady-state per-rank batch is ~512 so cudagraphs still apply. conc-list overlap at 4096 between the two topologies gives a direct crossover comparison point. --- .github/configs/nvidia-master.yaml | 20 ++- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 117 ++++++++++++++++++ 2 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 8f294462e..6bac5ee98 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7482,8 +7482,8 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - # Mid-to-high throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). - # 6 nodes. Single prefill is plenty for 1k prompts. + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). + # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] prefill: num-worker: 1 @@ -7497,6 +7497,22 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # The 4096 overlap with the 1p1d block gives a crossover point. 8192 + # would saturate 1p1d's prefill, so this topology takes over there. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true # --------------------------------------------------------------------- # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the # sweep-enabled gate while we collect 1k/1k data. Re-enable by diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..63e9e280c --- /dev/null +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,117 @@ +name: "dsv4-vllm-disagg-gb200-3p1d-dep8-dep16" + +# 1k/1k high-throughput topology: 3 prefill workers (DP=8) feeding a single +# wide decode (DP=16). 10 nodes total. Sized for conc 4096-8192 — at those +# concurrencies a single prefill worker (the 1p1d-dep8-dep16 sibling) +# becomes the bottleneck since 1k prefill arrival rate ~200-300 req/s +# exceeds what one DP=8 worker can sustain. +# +# Decode capacity: +# max-num-seqs: 1024 with DP=16 -> 16384 total simultaneous slots, which +# leaves headroom over the conc=8192 working set (per-rank avg 512). +# max-cudagraph-capture-size kept at 512: per-rank batch at conc=8192 is +# ~512 so cudagraphs still apply at steady state. 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 3072 + max-num-seqs: 1024 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false From 84be0b3a1b21007386e9f6a7cb82c8a3deda7abe Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 14:48:50 -0700 Subject: [PATCH 17/28] change concs --- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 +- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 256db4028..75b3d2770 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024x2048x3072x4096" + concurrencies: "256x512x1024x2048x4096" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml 
b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 576b7c8c0..59427712c 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -119,6 +119,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "4x8x16x32x64x128" + concurrencies: "8x32x64x128" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index 7fa5e47d2..ef6dcdc24 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -107,6 +107,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4x8x16x32x128" + concurrencies: "8x32x128" req_rate: "inf" use_chat_template: false diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 695db772a..6213373b3 100644 --- a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -117,6 +117,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "2048x4096" + concurrencies: "4096x8192" req_rate: "inf" use_chat_template: false From 8b1fbe29fbc277c775318aa1dfaa7a353cf593e4 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 15:41:09 -0700 Subject: [PATCH 18/28] Move srt-slurm-recipes/ under benchmarks/multi_node/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recipes are part of the multi-node benchmark plumbing — they belong next to the other multi-node assets (amd_utils/, dsr1_*_sglang-disagg.sh, gptoss_fp4_gb200_dynamo-trt.sh) rather than at the repo root. Updates the launch script's `cp -r` source path. The reference in perf-changelog.yaml's historical entry is left untouched (additions-only gate; it's only a description string). 
--- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 0 .../vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 0 .../vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 0 runners/launch_gb200-nv.sh | 2 +- 7 files changed, 1 insertion(+), 1 deletion(-) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename {srt-slurm-recipes => benchmarks/multi_node/srt-slurm-recipes}/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml (100%) diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml similarity index 100% rename from srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 6c8e706f1..45d49c09b 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -144,7 +144,7 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q2-2026
-    cp -r "$GITHUB_WORKSPACE/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+    cp -r "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"

From e095e00b8cdb7c1f37d82819593614051e1f9220 Mon Sep 17 00:00:00 2001
From: Oseltamivir
Date: Fri, 24 Apr 2026 16:17:33 -0700
Subject: [PATCH 19/28] Add 1p4d-dep8-tep8 TEP recipes for low concurrency
 (1k/1k + 8k/1k)

Decode workers use TP=8 within each worker (no data-parallel decode),
shedding attention-layer memory pressure compared to the dep8-dep8
sibling at the cost of an inter-rank TP all-reduce per attention layer.

Each rank holds:
* dep8 sibling: full attention replica + 1/8 of experts (EP=8)
* tep8 (this): 1/8 of attention (TP=8 sharded) + 1/8 experts (EP=8)

Same node count (10) and same conc-list as the dep8-dep8 sibling so the
two are directly comparable. Useful at low concurrency where TP
all-reduce overhead is a smaller fraction of step time.

Topology pattern derived from
kimi-k2.5/{1k1k,8k1k}/disagg-gb200-1p4d-dep4-tep4.yaml (the only vLLM
disagg TEP precedent on GB200 in upstream srt-slurm). Scaled to TP=8
because DSV4-Pro's attention layers don't fit the per-rank budget at
TP=4.

nvidia-master.yaml:
* Adds the 1k/1k TEP entry as a sibling to the existing dep8-dep8 entry
  (same conc-list [8, 64, 128], active).
* Adds the 8k/1k TEP entry inside the still-commented 8k/1k block
  (conc-list [8, 128]) so it's present when 8k/1k is re-enabled.
---
 .github/configs/nvidia-master.yaml            |  38 +++++-
 .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml    |   2 +-
 .../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml     | 125 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml     |   2 +-
 .../8k1k/disagg-gb200-1p4d-dep8-tep8.yaml     | 119 +++++++++++++++++
 5 files changed, 282 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6bac5ee98..d1829c64b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -7466,7 +7466,8 @@ dsv4-fp4-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-        # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes.
+        # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each).
+        # 10 nodes. Each decode rank holds full attention replica + 1/8 experts.
         # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten
         # sweep runtime. Re-add them together with the 8k/1k block below.
         - conc-list: [8, 64, 128]
@@ -7482,6 +7483,24 @@
             tp: 8
             ep: 8
             dp-attn: true
+        # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each).
+        # 10 nodes — same node count as the dep8-dep8 sibling.
Each decode rank + # holds 1/8 of attention (TP-sharded) + 1/8 of experts (EP), trading + # weight-memory headroom for an inter-rank TP all-reduce per attention + # layer. Same conc-list as the dep8 entry so they're directly comparable. + - conc-list: [8, 64, 128] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 1 + dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7523,7 +7542,7 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). + # # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). # # 10 nodes total. Low TTFT/TPOT focus. # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. # - conc-list: [8, 128] @@ -7539,6 +7558,21 @@ dsv4-fp4-gb200-dynamo-vllm: # tp: 8 # ep: 8 # dp-attn: true + # # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). + # # 10 nodes. Same conc-list as the dep8 sibling for direct A/B. + # - conc-list: [8, 128] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 1 + # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 75b3d2770..bf5b441b9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "256x512x1024x2048x4096" + concurrencies: "128x256x1024x2048x4096" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml new file mode 100644 index 000000000..b4567b5ce --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -0,0 +1,125 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" + +# 1k/1k TEP variant for low concurrency (4-128). +# +# Decode workers use tensor parallelism (TP=8) within each worker instead +# of data parallelism. Each rank holds 1/8 of attention/embedding (sharded) +# plus 1/8 of experts (EP) — vs the dep8 variant where each rank holds the +# full attention replica plus 1/8 of experts. TEP frees ~80-160 GB per rank +# of weight memory at the cost of an inter-rank TP all-reduce on every +# attention layer. At low conc (where attention all-reduce overhead is a +# smaller fraction of step time), this can be a net win on TTFT/TPOT. +# +# Topology: 1 prefill (DP=8) + 4 decode (TP=8 each). 10 nodes. Same node +# count as 1p4d-dep8-dep8, different memory split. +# +# Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml — the +# only vLLM disagg TEP precedent on GB200 in upstream srt-slurm. 
Scaled +# from kimi's TP=4 to TP=8 because DSV4-Pro is too large to TP-shard at 4. +# No upstream NVIDIA reference for DSV4-Pro TEP yet. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 3072 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 3072 + # 4 decode workers x 128 = 512 total simultaneous slots, well above + # max conc=128 in this entry. KV is TP=8-sharded so per-rank KV is + # 1/8 the dep8 case; we can afford the larger max-num-seqs. 
+ max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x64x128" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml index ef6dcdc24..0b000b8e3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml @@ -107,6 +107,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x32x128" + concurrencies: "4x8x32x64" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml new file mode 100644 index 000000000..e11c9a361 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -0,0 +1,119 @@ +name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" + +# 8k/1k TEP variant for low concurrency (4-128). +# +# See ../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml for the rationale (TP=8 +# decode workers shed attention-layer memory pressure vs the dep8 sibling +# at the cost of TP all-reduce latency). +# +# Differences from the 1k/1k version: +# * max-model-len: auto (matches NVIDIA PR #67-style 8k/1k recipes) +# * prefill max-num-seqs: 2 (NVIDIA's value — 8k inputs fill the 16384 +# max-num-batched-tokens budget at 2 prefills/batch) +# * decode max-num-seqs: 64 (KV is 8x larger per request than 1k/1k; +# even with TP=8 sharding, conservative max-num-seqs vs the 128 in +# the 1k/1k sibling. 4 workers x 64 = 256 simultaneous, plenty for +# the conc=128 max in this entry.) 
+ +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: auto + max-num-seqs: 2 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + block-size: 256 + gpu-memory-utilization: 0.88 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. 
+ tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: auto + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x128" + req_rate: "inf" + use_chat_template: false From 4666f607cf26fbb7ea064440cc971e5cd7b3647a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:27:39 -0700 Subject: [PATCH 20/28] conc changes --- .../vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 2 +- .../vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml index b4567b5ce..049d6d55f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -120,6 +120,6 @@ benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "8x64x128" + concurrencies: "4x16x64x128" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml index e11c9a361..6dee55304 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml @@ -114,6 +114,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "8x128" + concurrencies: "4x16x64x128" req_rate: "inf" use_chat_template: false From a51db718a5770602736308a81d08660a43307142 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 16:38:23 -0700 Subject: [PATCH 21/28] perfchangelog --- perf-changelog.yaml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e53e2f66a..84737bd78 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,12 +1,4 @@ - config-keys: -<<<<<<< dsv4-fp4-gb200-dynamo-vllm-disagg - - dsv4-fp4-gb200-dynamo-vllm - description: - - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 -======= - dsv4-fp4-b200-sglang description: - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" @@ -15,7 +7,6 @@ - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - "Prefix caching and speculative decoding disabled for baseline numbers" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 ->>>>>>> main - config-keys: - dsr1-fp8-h100-dynamo-trt @@ -1775,3 +1766,11 @@ - "Prefix caching disabled, no speculative decoding" - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1143 + +- config-keys: + - dsv4-fp4-gb200-dynamo-vllm + description: + - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" + - "Container: vllm/vllm-openai:deepseekv4-cu130" + - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 From c23c9fa9560724b71ea8f89b5001546782c0cfc0 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:03:17 -0700 Subject: [PATCH 22/28] Undo 1p4d-dep8-tep8 TEP recipes Reverts the experimental TEP-decode variant for low concurrency. Removes both 1k/1k and 8k/1k recipe files plus the active 1k/1k search-space entry and the (still-commented) 8k/1k entry in nvidia-master.yaml. Reverts the 'Interactivity (DP-decode)' / 'Interactivity (TEP-decode)' naming back to plain 'Interactivity' on the dep8-dep8 entries. --- .github/configs/nvidia-master.yaml | 38 +----- .../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 125 ------------------ .../8k1k/disagg-gb200-1p4d-dep8-tep8.yaml | 119 ----------------- 3 files changed, 2 insertions(+), 280 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a5b4087a8..97ebc9c67 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7530,8 +7530,7 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). - # 10 nodes. Each decode rank holds full attention replica + 1/8 experts. + # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten # sweep runtime. Re-add them together with the 8k/1k block below. - conc-list: [8, 64, 128] @@ -7547,24 +7546,6 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true - # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). - # 10 nodes — same node count as the dep8-dep8 sibling. Each decode rank - # holds 1/8 of attention (TP-sharded) + 1/8 of experts (EP), trading - # weight-memory headroom for an inter-rank TP all-reduce per attention - # layer. Same conc-list as the dep8 entry so they're directly comparable. - - conc-list: [8, 64, 128] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml" - decode: - num-worker: 4 - tp: 8 - ep: 1 - dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7606,7 +7587,7 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity (DP-decode): 1 prefill (DP=8) + 4 decodes (DP=8 each). + # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). # # 10 nodes total. Low TTFT/TPOT focus. # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. # - conc-list: [8, 128] @@ -7622,21 +7603,6 @@ dsv4-fp4-gb200-dynamo-vllm: # tp: 8 # ep: 8 # dp-attn: true - # # Interactivity (TEP-decode): 1 prefill (DP=8) + 4 decodes (TP=8 each). - # # 10 nodes. 
Same conc-list as the dep8 sibling for direct A/B. - # - conc-list: [8, 128] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 1 - # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml deleted file mode 100644 index 049d6d55f..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ /dev/null @@ -1,125 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" - -# 1k/1k TEP variant for low concurrency (4-128). -# -# Decode workers use tensor parallelism (TP=8) within each worker instead -# of data parallelism. Each rank holds 1/8 of attention/embedding (sharded) -# plus 1/8 of experts (EP) — vs the dep8 variant where each rank holds the -# full attention replica plus 1/8 of experts. TEP frees ~80-160 GB per rank -# of weight memory at the cost of an inter-rank TP all-reduce on every -# attention layer. At low conc (where attention all-reduce overhead is a -# smaller fraction of step time), this can be a net win on TTFT/TPOT. -# -# Topology: 1 prefill (DP=8) + 4 decode (TP=8 each). 10 nodes. Same node -# count as 1p4d-dep8-dep8, different memory split. -# -# Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml — the -# only vLLM disagg TEP precedent on GB200 in upstream srt-slurm. Scaled -# from kimi's TP=4 to TP=8 because DSV4-Pro is too large to TP-shard at 4. -# No upstream NVIDIA reference for DSV4-Pro TEP yet. 
- -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: 3072 - max-num-seqs: 16 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: 3072 - # 4 decode workers x 128 = 512 total simultaneous slots, well above - # max conc=128 in this entry. KV is TP=8-sharded so per-rank KV is - # 1/8 the dep8 case; we can afford the larger max-num-seqs. - max-num-seqs: 128 - max-cudagraph-capture-size: 128 - max-num-batched-tokens: 128 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4x16x64x128" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml deleted file mode 100644 index 6dee55304..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-tep8.yaml +++ /dev/null @@ -1,119 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-tep8" - -# 8k/1k TEP variant for low concurrency (4-128). -# -# See ../1k1k/disagg-gb200-1p4d-dep8-tep8.yaml for the rationale (TP=8 -# decode workers shed attention-layer memory pressure vs the dep8 sibling -# at the cost of TP all-reduce latency). 
-# -# Differences from the 1k/1k version: -# * max-model-len: auto (matches NVIDIA PR #67-style 8k/1k recipes) -# * prefill max-num-seqs: 2 (NVIDIA's value — 8k inputs fill the 16384 -# max-num-batched-tokens budget at 2 prefills/batch) -# * decode max-num-seqs: 64 (KV is 8x larger per request than 1k/1k; -# even with TP=8 sharding, conservative max-num-seqs vs the 128 in -# the 1k/1k sibling. 4 workers x 64 = 256 simultaneous, plenty for -# the conc=128 max in this entry.) - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - # TP=8 across 8 GPUs (one node per worker). No data-parallel-size. - tensor-parallel-size: 8 - pipeline-parallel-size: 1 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 64 - max-cudagraph-capture-size: 64 - max-num-batched-tokens: 64 - trust-remote-code: true - no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x16x64x128" - req_rate: "inf" - use_chat_template: false From 7c8b85919bb2074726a3d31cae83f4f1c4b56373 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:12:40 -0700 Subject: [PATCH 23/28] Adopt NVIDIA aflowers/gb200-dsv4-recipes 1p1d-dep8-tep8 for low conc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the NVIDIA-official TEP recipe for very low concurrency: https://github.com/NVIDIA/srt-slurm/blob/aflowers/gb200-dsv4-recipes/ recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml Topology: 1 prefill (DP=8) + 1 decode (TP=8) — 4 nodes. 
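(Node math, from the recipes' gpus_per_node: 4 on GB200: 1 prefill
worker x 8 GPUs + 1 decode worker x 8 GPUs = 16 GPUs = 4 nodes.)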
Adds 1k/1k sibling (no upstream equivalent) by shrinking max-model-len to 3072. Local deviations from upstream (documented in recipe headers): * model.path renamed deepseekv4-fp4 -> deepseek-v4-pro to match our launch script's SRT_SLURM_MODEL_PREFIX. * Stripped CPU/DRAM offload knobs and numa-bind (our pinned NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the vllm_numa_bind_hash_fix.py patch upstream uses). * benchmark.use_chat_template: false (no PR #68 sa-bench changes in our srtctl); benchmark.tokenizer_mode dropped for the same reason. * Container kept on the floating tag; health_check + slurm.time_limit added for cold-cache Lustre loads. Replaces the 1p4d-dep8-dep8 low-conc entries (10-node, 4 decode workers) with this 4-node TEP topology in both 1k/1k (active) and 8k/1k (still commented). Deletes the now-unused 1p4d-dep8-dep8 recipe files. Active 1k/1k sweep: 3 entries / 14 benchmark points. --- .github/configs/nvidia-master.yaml | 31 ++-- ....yaml => disagg-gb200-1p1d-dep8-tep8.yaml} | 72 +++++---- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 149 ++++++++++++++++++ .../8k1k/disagg-gb200-1p4d-dep8-dep8.yaml | 112 ------------- 4 files changed, 206 insertions(+), 158 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/{disagg-gb200-1p4d-dep8-dep8.yaml => disagg-gb200-1p1d-dep8-tep8.yaml} (54%) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 97ebc9c67..91b771a67 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7530,22 +7530,22 @@ dsv4-fp4-gb200-dynamo-vllm: - isl: 1024 osl: 1024 search-space: - # Interactivity: 1 prefill (DP=8) + 4 decodes (DP=8 each). 10 nodes. - # NOTE: conc-list was [4, 8, 16, 32, 64, 128]; 4/16/32 dropped to shorten - # sweep runtime. Re-add them together with the 8k/1k block below. - - conc-list: [8, 64, 128] + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch + # 1p1d-dep8-tep8.yaml (offload + numa-bind stripped — see recipe header). + - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + - "CONFIG_FILE=recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: - num-worker: 4 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - conc-list: [256, 512, 1024, 2048, 3072, 4096] @@ -7587,22 +7587,21 @@ dsv4-fp4-gb200-dynamo-vllm: # - isl: 8192 # osl: 1024 # search-space: - # # Interactivity: 1 prefill (DP=8) + 4 separate decodes (DP=8 each). - # # 10 nodes total. Low TTFT/TPOT focus. - # # NOTE: conc-list was [4, 8, 16, 32, 128]; 4/16/32 dropped. - # - conc-list: [8, 128] + # # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. 
+ # - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 # tp: 8 # ep: 8 # dp-attn: true # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml" + # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: - # num-worker: 4 + # num-worker: 1 # tp: 8 - # ep: 8 - # dp-attn: true + # ep: 1 + # dp-attn: false # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. # - conc-list: [512, 1024] # prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 54% rename from benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 59427712c..c25de42a0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -1,16 +1,15 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" -# 1k/1k interactivity variant of the 8k/1k recipe with the same name (under -# ../8k1k/). Extrapolated from kimi-k2.5/1k1k/disagg-gb200-1p4d-dep4-tep4.yaml -# adjusted for DSV4-Pro's DP>=8 minimum (kimi uses TP=4, we use DP=8 per -# worker since model layers don't fit at smaller GPU counts). +# 1k/1k variant of NVIDIA's 8k/1k 1p1d-dep8-tep8 recipe (mirrored from +# aflowers/gb200-dsv4-recipes branch). Same topology and tuning; only +# max-model-len shrinks from 9280 (8k+1k+pad) to 3072 (1k+1k+pad). No +# upstream NVIDIA reference for DSV4-Pro 1k/1k vLLM disagg yet. # -# Differences from our 8k1k 1p4d-dep8-dep8: -# * max-model-len: 3072 (1024 + 1024 + 1024 headroom) instead of auto/10240 -# * prefill max-num-seqs: 16 instead of 2 (1k prompts fit 16/batch within -# the same 16384 max-num-batched-tokens budget) -# * decode max-num-seqs: 128 instead of 32 (shorter KV = more headroom) -# * max-cudagraph-capture-size / max-num-batched-tokens (decode): 128 +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. Targets +# very low concurrency (1-64). +# +# Local deltas vs upstream 8k/1k sibling: same as the 8k/1k recipe — see +# ../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full deviation list. model: path: "deepseek-v4-pro" @@ -23,16 +22,9 @@ dynamo: setup_script: vllm-container-deps.sh -# Also set slurm.time_limit explicitly (above srtslurm.yaml's 6h default) so -# a slow first-time Lustre load + cudagraph capture can't get cut off by the -# SLURM wall clock. slurm: time_limit: "8:00:00" -# Bumped from the 1800s default to 4 hours. DSV4-Pro weights load slowly from -# Lustre with multiple workers contending for the same OSTs — previous 1k/1k -# run hit the default 1800s. Make this *very* generous since the cost of an -# over-long deadline is just sitting idle, not wasted compute. 
health_check: max_attempts: 1440 interval_seconds: 10 @@ -41,9 +33,9 @@ resources: gpu_type: "gb200" gpus_per_node: 4 prefill_nodes: 2 - decode_nodes: 8 + decode_nodes: 2 prefill_workers: 1 - decode_workers: 4 + decode_workers: 1 gpus_per_prefill: 8 gpus_per_decode: 8 @@ -56,20 +48,38 @@ backend: connector: null prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" TILELANG_CLEANUP_TEMP_FILES: "1" VLLM_USE_NCCL_SYMM_MEM: "1" NCCL_CUMEM_ENABLE: "1" NCCL_MNNVL_ENABLE: "1" NCCL_NVLS_ENABLE: "1" VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL vllm_config: prefill: @@ -84,41 +94,43 @@ backend: enforce-eager: true max-model-len: 3072 max-num-seqs: 16 - max-num-batched-tokens: 16384 + max-num-batched-tokens: 32768 trust-remote-code: true no-enable-prefix-caching: true no-enable-flashinfer-autotune: true + no-async-scheduling: true block-size: 256 - gpu-memory-utilization: 0.88 + gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 decode: kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' served-model-name: "deepseek-ai/DeepSeek-V4-Pro" kv-cache-dtype: "fp8" - tensor-parallel-size: 1 + tensor-parallel-size: 8 pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 enable-expert-parallel: true max-model-len: 3072 - max-num-seqs: 128 - max-cudagraph-capture-size: 128 - max-num-batched-tokens: 128 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 trust-remote-code: true no-enable-prefix-caching: true block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' gpu-memory-utilization: 0.9 stream-interval: 50 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + tokenizer-mode: deepseek_v4 benchmark: type: "sa-bench" isl: 1024 osl: 1024 - concurrencies: "8x32x64x128" + concurrencies: "1x4x8x16x32x64" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..1cf645e52 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,149 @@ +name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" + +# Mirrored from NVIDIA/srt-slurm aflowers/gb200-dsv4-recipes branch: +# recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +# +# Topology: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes total. 
Targets +# very low concurrency (1-64) where TEP-style decode (TP-sharded +# attention + EP'd experts within one worker) gives the best per-user +# latency. +# +# Local deltas vs upstream: +# * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match +# our launch script's SRT_SLURM_MODEL_PREFIX. +# * CPU/DRAM offload knobs (offload-group-size / -num-in-group / +# -prefetch-step / # offload-params) and numa-bind dropped — our +# clone is NVIDIA/srt-slurm@sa-submission-q2-2026 which doesn't ship +# the vllm_numa_bind_hash_fix.py patch. +# * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode +# dropped. Both require PR #68 sa-bench tokenizer support that our +# pinned srtctl version doesn't have. The recipe-level +# `tokenizer-mode: deepseek_v4` for workers stays. +# * Container kept on the floating tag (`:deepseekv4-cu130`) instead of +# the upstream sha256 pin. +# * health_check / slurm.time_limit added — we observed cold-cache +# Lustre loads exceeding the default 1800s deadline. + +model: + path: "deepseek-v4-pro" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + 
pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml deleted file mode 100644 index 0b000b8e3..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p4d-dep8-dep8.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-vllm-disagg-gb200-1p4d-dep8-dep8" - -# Interactivity-focused topology: 1 prefill worker + 4 separate decode -# workers, each at DP=8. Targets conc 4-128 where TTFT/TPOT matter more -# than aggregate throughput. Same per-worker vllm_config as the NVIDIA -# 7p1d reference (PR #67); only resources, decode capacity (max-num-seqs -# / cudagraph capture / batched-tokens), and benchmark concurrencies -# differ. - -model: - path: "deepseek-v4-pro" - container: "vllm/vllm-openai:deepseekv4-cu130" - precision: "fp4" - -dynamo: - hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b - install: true - -setup_script: vllm-container-deps.sh - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 8 - prefill_workers: 1 - decode_workers: 4 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: vllm - connector: null - - prefill_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - decode_environment: - TILELANG_CLEANUP_TEMP_FILES: "1" - VLLM_USE_NCCL_SYMM_MEM: "1" - NCCL_CUMEM_ENABLE: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_NVLS_ENABLE: "1" - VLLM_SERVER_DEV_MODE: "1" - - vllm_config: - prefill: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - enforce-eager: true - max-model-len: auto - max-num-seqs: 2 - max-num-batched-tokens: 16384 - trust-remote-code: true - no-enable-prefix-caching: true - no-enable-flashinfer-autotune: true - block-size: 256 - gpu-memory-utilization: 0.88 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - - decode: - kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - kv-cache-dtype: "fp8" - tensor-parallel-size: 1 - pipeline-parallel-size: 1 - data-parallel-size: 8 - data-parallel-rpc-port: 13345 - enable-expert-parallel: true - max-model-len: auto - max-num-seqs: 32 - max-cudagraph-capture-size: 32 - max-num-batched-tokens: 32 - trust-remote-code: true - 
no-enable-prefix-caching: true - block-size: 256 - compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' - gpu-memory-utilization: 0.9 - stream-interval: 50 - no-disable-hybrid-kv-cache-manager: true - enable-sleep-mode: true - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4x8x32x64" - req_rate: "inf" - use_chat_template: false From 42d9107fcb6c091ec58096b21f3893e30e1755db Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 18:33:30 -0700 Subject: [PATCH 24/28] Re-add CPU/DRAM offload to 1p1d-dep8-tep8 recipes (load-bearing) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last run failed with "Available KV cache memory: -15.99 GiB" on every prefill rank — model weights + activations alone exceed the gpu-memory-utilization=0.8 budget by ~16 GB at DP=8 (full attention replicated per rank + 1/8 of FP4 experts). The upstream recipe ships with offload precisely to free that ~16 GB by spilling MoE expert weights to host DRAM. Restores the three offload knobs on prefill in both 1k/1k and 8k/1k: offload-group-size: 3 offload-num-in-group: 1 offload-prefetch-step: 2 numa-bind: true is still excluded — needs the configs/patches/vllm_numa_bind_hash_fix.py patch that our pinned NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship. Offload works without it (just slower host-side bandwidth). --- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 8 ++++++++ .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index c25de42a0..984c79526 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -103,6 +103,14 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + # CPU/DRAM expert offload — required for fit. Without these the prefill + # rank reports `Available KV cache memory: -16 GiB` and the engine + # refuses to start. Numa-bind from upstream is still off because our + # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the + # vllm_numa_bind_hash_fix.py patch. + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 1cf645e52..0c872e9c4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -11,10 +11,10 @@ name: "dsv4-vllm-disagg-gb200-1p1d-dep8-tep8" # Local deltas vs upstream: # * model.path alias renamed deepseekv4-fp4 -> deepseek-v4-pro to match # our launch script's SRT_SLURM_MODEL_PREFIX. -# * CPU/DRAM offload knobs (offload-group-size / -num-in-group / -# -prefetch-step / # offload-params) and numa-bind dropped — our -# clone is NVIDIA/srt-slurm@sa-submission-q2-2026 which doesn't ship -# the vllm_numa_bind_hash_fix.py patch. 
+# * numa-bind dropped — our clone is NVIDIA/srt-slurm@sa-submission-q2-2026 +# which doesn't ship the vllm_numa_bind_hash_fix.py patch. CPU/DRAM +# expert offload (offload-group-size/-num-in-group/-prefetch-step) is +# KEPT — it's load-bearing here, see the comment in vllm_config.prefill. # * benchmark.use_chat_template: true -> false; benchmark.tokenizer_mode # dropped. Both require PR #68 sa-bench tokenizer support that our # pinned srtctl version doesn't have. The recipe-level @@ -116,6 +116,14 @@ backend: gpu-memory-utilization: 0.8 no-disable-hybrid-kv-cache-manager: true enable-sleep-mode: true + # CPU/DRAM expert offload — required for fit. Without these the prefill + # rank reports `Available KV cache memory: -16 GiB` and the engine + # refuses to start. Numa-bind from upstream is still off because our + # NVIDIA/srt-slurm@sa-submission-q2-2026 clone doesn't ship the + # vllm_numa_bind_hash_fix.py patch. + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 tokenizer-mode: deepseek_v4 decode: From 47d3cdc6b52df99b9963ffefe1ac7ac11fae3b49 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:34:41 -0700 Subject: [PATCH 25/28] PR review fixes: harden cp -rT, refresh stale changelog description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * runners/launch_gb200-nv.sh: switch the recipe overlay step from `cp -r src dst` to `cp -rT src dst` (with explicit `mkdir -p dst` first). Addresses the bot review nit at line 144 — `cp -r src dst` works only because the upstream sa-submission-q2-2026 branch has no `recipes/vllm/deepseek-v4/` directory today; if upstream ever ships one, `cp -r` would nest as `recipes/vllm/deepseek-v4/deepseek-v4/...` and CONFIG_FILE in nvidia-master.yaml would silently resolve to the upstream stub. `-T` overlays unconditionally. * perf-changelog.yaml: refresh the dsv4-fp4-gb200-dynamo-vllm entry's description. The previous wording referenced "8k1k, 7p1d-dep8-dep16" and "Mirrors NVIDIA/srt-slurm PR #67" which is stale after the move to a 1k/1k sweep with TEP low-conc (mirrored from PR #71) plus two hand-rolled mid/high topologies. Also fixes the directory reference (recipes moved to benchmarks/multi_node/srt-slurm-recipes/ during the cleanup pass). 
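A minimal shell repro of the nesting pitfall (hypothetical scratch
paths; GNU coreutils cp, whose -T flag treats the destination as the
target directory itself rather than a directory to copy into):

    # Pretend upstream ships a stub recipes/vllm/deepseek-v4/.
    mkdir -p ours/deepseek-v4/1k1k upstream/recipes/vllm/deepseek-v4
    touch ours/deepseek-v4/1k1k/demo.yaml

    cp -r ours/deepseek-v4 upstream/recipes/vllm/deepseek-v4
    # nests: upstream/recipes/vllm/deepseek-v4/deepseek-v4/1k1k/demo.yaml

    rm -rf upstream/recipes/vllm/deepseek-v4/deepseek-v4  # reset the demo
    cp -rT ours/deepseek-v4 upstream/recipes/vllm/deepseek-v4
    # overlays: upstream/recipes/vllm/deepseek-v4/1k1k/demo.yaml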
--- perf-changelog.yaml | 7 ++++--- runners/launch_gb200-nv.sh | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d1f83e721..453488420 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1779,7 +1779,8 @@ - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: - - "Add DeepSeek V4 Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (8k1k, 7p1d-dep8-dep16)" - - "Container: vllm/vllm-openai:deepseekv4-cu130" - - "Mirrors NVIDIA/srt-slurm PR #67; recipes stored in srt-slurm-recipes/ and copied into srt-slurm checkout at runtime" + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated vLLM benchmarks via Dynamo (1k/1k sweep; 8k/1k currently commented out)" + - "Container: vllm/vllm-openai:deepseekv4-cu130; model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern" + - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 45d49c09b..224c3a928 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -144,7 +144,12 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout sa-submission-q2-2026 - cp -r "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + # Use `cp -rT` so if the upstream branch ever ships a stub + # `recipes/vllm/deepseek-v4/` directory, we overlay our recipes onto + # it rather than nesting (`cp -r src dst` would create + # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). + mkdir -p recipes/vllm/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 9cd8f7070f52b8a3467fc5e8b63350767e1f7286 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:35:17 -0700 Subject: [PATCH 26/28] activate 8k1k --- .github/configs/nvidia-master.yaml | 102 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 91b771a67..33563fe25 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7577,57 +7577,51 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true - # --------------------------------------------------------------------- - # 8k/1k block — TEMPORARILY DISABLED to avoid re-running under the - # sweep-enabled gate while we collect 1k/1k data. Re-enable by - # uncommenting (remove the leading "# " on every line of the block - # below). The conc-lists already have 4/16/32 stripped — add them back - # together with the 1k/1k 1p4d block if you want the full sweep again. - # --------------------------------------------------------------------- - # - isl: 8192 - # osl: 1024 - # search-space: - # # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # # 4 nodes total. 
Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: false - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - # - conc-list: [2048, 4096] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true From 980b77749c31c33a751f75e80b7a85fd44907f4d Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:39:26 -0700 Subject: [PATCH 27/28] Fix 8k/1k seq-len-config indent in nvidia-master.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the 8k/1k block was uncommented, every line landed two spaces too deep — the block became a child of the 1k/1k entry's search-space list instead of a sibling under seq-len-configs. process_changelog.py's pydantic check caught this: seq-len-configs.0.search-space.3.prefill: Field required seq-len-configs.0.search-space.3.isl: Extra inputs are not permitted (The validator was reading the 8k/1k entry as a 4th search-space item that lacked prefill/decode and had stray isl/osl fields.) Dedented the entire 8k/1k block by 2 spaces. Schema validates, matrix expansion produces 6 entries / 24 benchmark points across 1k/1k + 8k/1k. 
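Schematically (sibling entries and values elided; a sketch of the shape, not
the full schema), the nesting pydantic rejected vs. the intended one:

    # Before: two spaces too deep, so the 8k/1k entry parses as another
    # item of the 1k/1k entry's search-space list:
    seq-len-configs:
      - isl: 1024
        osl: 1024
        search-space:
          - conc-list: [...]
            prefill: {...}
            decode: {...}
          - isl: 8192          # <- the bogus search-space item: stray
            osl: 1024          #    isl/osl, no prefill/decode
            search-space: [...]

    # After: dedented two spaces, a proper sibling under seq-len-configs:
    seq-len-configs:
      - isl: 1024
        osl: 1024
        search-space: [...]
      - isl: 8192
        osl: 1024
        search-space: [...]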
--- .github/configs/nvidia-master.yaml | 94 +++++++++++++++--------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 33563fe25..3604e249e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7578,50 +7578,50 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). - # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes - # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] - prefill: - num-worker: 7 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). + # 4 nodes total. Mirrors NVIDIA aflowers/gb200-dsv4-recipes branch. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes total. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes + # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. + - conc-list: [2048, 4096] + prefill: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true From d1349b2e0807087ecec39400b082fc6bb63f8e95 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Fri, 24 Apr 2026 19:43:26 -0700 Subject: [PATCH 28/28] Align matrix conc-lists to recipe concurrencies (recipe is source of truth) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workflow only exports CONFIG_FILE to srtctl and doesn't rewrite the recipe's benchmark.concurrencies block — so what actually runs is determined by the recipe, while the matrix conc-list only drives job naming and result aggregation. When the two disagree the matrix labels end up wrong (some advertised concs never run; runs land under mismatched labels). 
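Schematically, the two declarations involved (the x-separated string shape
is inferred from the recipe values quoted below; treat this as a sketch,
not the exact recipe schema):

    # recipe YAML, read by srtctl -- this is what actually runs:
    benchmark:
      concurrencies: "128x256x1024x2048x4096"

    # matrix YAML (nvidia-master.yaml) -- job naming / aggregation only:
    - conc-list: [128, 256, 1024, 2048, 4096]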
Two mismatches caught by audit: 1k/1k 1p1d-dep8-dep16: matrix [256, 512, 1024, 2048, 3072, 4096] -> [128, 256, 1024, 2048, 4096] recipe stays 128x256x1024x2048x4096 8k/1k 7p1d-dep8-dep16: matrix [2048, 4096] -> [4096, 8192] recipe stays 4096x8192 Picked recipe-side as the source of truth so the recipes stay self-consistent; matrix labels now reflect what srtctl will actually run. --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3604e249e..eab500d25 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7548,7 +7548,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: false # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). # 6 nodes. Single prefill is plenty for 1k prompts up to ~conc 4096. - - conc-list: [256, 512, 1024, 2048, 3072, 4096] + - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 tp: 8 @@ -7612,7 +7612,7 @@ dsv4-fp4-gb200-dynamo-vllm: dp-attn: true # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes # (full cluster). Mirrors NVIDIA/srt-slurm PR #67. - - conc-list: [2048, 4096] + - conc-list: [4096, 8192] prefill: num-worker: 7 tp: 8