From 93db2e2b3f9f99ac86c7d2f28cc5b718b62661de Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 13:00:49 -0700 Subject: [PATCH 01/56] Day 0 DeepSeek V4 Pro FP4 GB200 disaggregated SGLang benchmarks --- .github/configs/nvidia-master.yaml | 112 +++++++++++++++++ .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 110 +++++++++++++++++ .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 115 ++++++++++++++++++ .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 111 +++++++++++++++++ .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 106 ++++++++++++++++ .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 109 +++++++++++++++++ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 110 +++++++++++++++++ perf-changelog.yaml | 9 ++ runners/launch_gb200-nv.sh | 16 +++ 9 files changed, 798 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..b2d361f65 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7666,3 +7666,115 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 16 ep: 16 dp-attn: true + +dsv4-fp4-gb200-dynamo-sglang: + image: lmsysorg/sglang:deepseek-v4-grace-blackwell + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + 
seq-len-configs: + # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg + # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling + # so framework-level numbers are directly comparable. Per-worker + # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and + # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml + # (DSR1 sglang disagg structure). + - isl: 1024 + osl: 1024 + search-space: + # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. + - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # 4096 overlap with the 1p1d block gives a topology-crossover A/B. + - conc-list: [4096, 8192] + prefill: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + + # 8k/1k block kept commented out — same rationale as the dsv4-fp4- + # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. + # Uncomment to re-enable (recipes are already in place). + # - isl: 8192 + # osl: 1024 + # search-space: + # # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. 
+ # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: false + # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes. + # - conc-list: [4096, 8192] + # prefill: + # num-worker: 7 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml new file mode 100644 index 000000000..6eecc801b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -0,0 +1,110 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16" + +# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the +# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg). +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. +# +# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes. +# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank +# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 512 + cuda-graph-max-bs: 512 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256x1024x2048x4096" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..5c44400e3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,115 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The +# closest references on NVIDIA/srt-slurm are: +# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) — +# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars. +# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) — +# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 + +# chunked-prefill-size=4096 + disable-flashinfer-autotune. +# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- +# framework numbers stay directly comparable. +# +# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes. +# Targets very low concurrency (1-64) where TP-sharded decode gives the +# best per-user latency. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline + # that's actually been run upstream) plus the disaggregation timeout + # triple — heartbeat 100k matches the DSR1 sglang disagg convention. + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + 
moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 64 + cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..bb61350b2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,111 @@ +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the +# upstream-reference list. Topology mirrors the dsv4-fp4-gb200-dynamo- +# vllm sibling. +# +# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes. +# Sized for conc 4096-8192 — at those concurrencies a single prefill +# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the +# 1k prefill arrival rate exceeds what one DP=8 worker can sustain. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 16 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 3072 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 000000000..abe23d2dd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,106 @@ +name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" + +# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and +# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280 +# (8k+1k+pad), and prefill max-running-requests halves to keep the per- +# rank prefill working set inside the GPU memory budget. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream- +# reference list (PR #69 GB200 agg, PR #75 GB300 disagg). 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 8 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 64 + 
cuda-graph-max-bs: 64 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml new file mode 100644 index 000000000..bdbfaa735 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -0,0 +1,109 @@ +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# 8k/1k mid-throughput topology: 3 prefill (DP=8 EP=8) + 1 wide decode +# (DP=16 EP=16). 10 nodes. Targets conc 512-1024 — 8k prompts saturate +# a single prefill worker below conc=512. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference +# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 4 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 256 + cuda-graph-max-bs: 256 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml new file mode 100644 index 000000000..de9bd45df --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -0,0 +1,110 @@ +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" + +# 8k/1k max-throughput topology: 7 prefill (DP=8 EP=8) + 1 wide decode +# (DP=16 EP=16). 18 nodes — full GB200 cluster. Targets conc 4096-8192. +# Per-worker tunings identical to the 3p1d sibling; only prefill_workers +# and prefill_nodes scale up. +# +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference +# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. 
+ +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +dynamo: + version: 0.8.1 + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + connector: null + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 8 + dp-size: 8 + ep-size: 8 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 4 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "prefill" + disaggregation-transfer-backend: nixl + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + tensor-parallel-size: 16 + dp-size: 16 + ep-size: 16 + enable-dp-attention: true + moe-a2a-backend: "deepep" + moe-runner-backend: "flashinfer_mxfp4" + chunked-prefill-size: 4096 + disable-flashinfer-autotune: true + disable-radix-cache: true + 
mem-fraction-static: 0.82 + context-length: 9280 + max-running-requests: 256 + cuda-graph-max-bs: 256 + stream-interval: 50 + decode-log-interval: 1000 + disaggregation-mode: "decode" + disaggregation-transfer-backend: nixl + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 397da6591..45bc466fc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1819,3 +1819,12 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 + +- config-keys: + - dsv4-fp4-gb200-dynamo-sglang + description: + - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" + - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" + - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" + - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). 
Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 224c3a928..08897874e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,6 +15,12 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Same compute-node-local NVMe path as the dynamo-vllm dsv4 + # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX + # matches the model.path alias in our DSV4 sglang recipes. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else export MODEL_PATH=$MODEL fi @@ -150,6 +156,16 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 + # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) + # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm + # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. 
+ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 1bc4c2e6929d098456e11557c5c0fb86423bad48 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 13:35:16 -0700 Subject: [PATCH 02/56] Drop unsupported backend.connector field from sglang recipes srtctl SrtConfig schema rejects backend.connector for the sglang backend type. The field was carried over from the dynamo-vllm dsv4 recipes (where it is valid and set to null). PR #69/#75 sglang recipes upstream do not declare it. --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 - .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 1 - .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 1 - 6 files changed, 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 6eecc801b..6a78c476a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 5c44400e3..3da368c17 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -45,7 +45,6 @@ frontend: backend: type: sglang - connector: null # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline # that's actually been run upstream) plus the disaggregation timeout diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index bb61350b2..12b1207bb 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -40,7 +40,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index abe23d2dd..54debefef 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index bdbfaa735..f377c803e 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -38,7 +38,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index de9bd45df..53b7661d6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -39,7 +39,6 @@ frontend: backend: type: sglang - connector: null prefill_environment: PYTHONUNBUFFERED: "1" From 65b8b1711de84af4c253df12512b1638108abb46 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:05:08 -0700 Subject: [PATCH 03/56] =?UTF-8?q?Drop=20dynamo:=20version:=200.8.1=20?= =?UTF-8?q?=E2=80=94=20incompatible=20with=20deepseek-v4-grace-blackwell?= =?UTF-8?q?=20sglang=20fork?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-installing dynamo 0.8.1 over the lmsysorg/sglang:deepseek-v4-grace-blackwell container's pre-baked sglang fails at import time: File ".../dynamo/sglang/health_check.py", line 20 def _get_bos_token_id_from_engine(engine: Optional[sgl.Engine]) AttributeError: module 'sglang' has no attribute 'Engine' The DSV4 sglang fork bundled in this image does not expose sgl.Engine. Drop the dynamo: block so srtctl uses the dynamo build pre-installed in the container — matches NVIDIA/srt-slurm PR #75 (the only upstream DSV4 sglang disagg recipe), which also has no dynamo: block. 
--- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 9 +++++++-- .../deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++++-- .../deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 7 +++++-- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 6a78c476a..f497da7fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. 
slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 3da368c17..f616b553d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,8 +19,13 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — srtctl skips the dynamo pip install and uses the +# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell +# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang +# recipes) imports `sgl.Engine`, which this image's sglang fork does not +# expose, so re-installing it breaks startup with `AttributeError: +# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream +# DSV4 sglang disagg recipe) follows the same pattern. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 12b1207bb..e382271b8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,8 +14,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale. 
srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 54debefef..226565d55 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index f377c803e..6bb69816c 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,8 +12,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. 
srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 53b7661d6..311482e37 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,8 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -dynamo: - version: 0.8.1 +# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale. srtctl uses the dynamo build baked into the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing +# dynamo 0.8.1 on top breaks startup with `AttributeError: module +# 'sglang' has no attribute 'Engine'`. slurm: time_limit: "8:00:00" From 9d883ba0d474fb76c022f286ee30bd59e6413802 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:11:23 -0700 Subject: [PATCH 04/56] =?UTF-8?q?Add=20dynamo:=20install:=20false=20?= =?UTF-8?q?=E2=80=94=20srtctl=20default=20is=20install=3DTrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit srtctl's DynamoConfig (src/srtctl/core/schema.py L680) defaults to install=True, which pip installs dynamo 0.8.0 even when no `dynamo:` block is specified. Use the explicit opt-out so srtctl uses the dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell image. This image's sglang fork doesn't expose sgl.Engine, which dynamo.sglang.health_check imports at top level — re-installing dynamo over it breaks startup. 
--- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 10 +++++----- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 16 +++++++++------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 +++++----- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 10 +++++----- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 +++++----- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 10 +++++----- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index f497da7fc..29f10cd1b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index f616b553d..e2cb204d9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,13 +19,15 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — srtctl skips the dynamo pip install and uses the -# dynamo build baked into the lmsysorg/sglang:deepseek-v4-grace-blackwell -# image. dynamo 0.8.1 (the version pinned by upstream DSR1 sglang -# recipes) imports `sgl.Engine`, which this image's sglang fork does not -# expose, so re-installing it breaks startup with `AttributeError: -# module 'sglang' has no attribute 'Engine'`. PR #75 (the only upstream -# DSV4 sglang disagg recipe) follows the same pattern. +# `install: false` is required: srtctl's DynamoConfig defaults to +# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's +# `dynamo.sglang.health_check` module imports `sgl.Engine` at top +# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's +# sglang fork does not expose — re-installing dynamo breaks startup +# with `AttributeError: module 'sglang' has no attribute 'Engine'`. +# Use whatever dynamo build is already baked into the container. 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index e382271b8..1c978deac 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,11 +14,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the +# rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 226565d55..e2c15c775 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. 
srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6bb69816c..ddd061174 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,11 +12,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). 
+dynamo: + install: false slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 311482e37..10dd11da0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,11 +13,11 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# No `dynamo:` block — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale. srtctl uses the dynamo build baked into the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image; pip-installing -# dynamo 0.8.1 on top breaks startup with `AttributeError: module -# 'sglang' has no attribute 'Engine'`. +# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for +# the rationale (srtctl defaults to installing dynamo 0.8.0, but that +# breaks against the deepseek-v4-grace-blackwell sglang fork). +dynamo: + install: false slurm: time_limit: "8:00:00" From 1b75dd7c4e122b21142ec3b12a6353da61d7229b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 14:39:18 -0700 Subject: [PATCH 05/56] Pin dynamo to v1.2.0-sglang-deepseek-v4-dev.1 tag (hash 21f135f5) install: false fixed the pip-install crash, but the lmsysorg/sglang:deepseek-v4-grace-blackwell image doesn't have dynamo pre-installed (ModuleNotFoundError: No module named 'dynamo'), so srtctl needs to install something compatible. The DSV4-targeted dynamo tag v1.2.0-sglang-deepseek-v4-dev.1 (sha 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b) includes 'from __future__ import annotations' in dynamo/sglang/health_check.py (ai-dynamo PR #7255, commit cdb7218a, 2026-03-12), which makes the Optional[sgl.Engine] annotation lazy. 
The PyPI 0.8.0/0.8.1 releases predate that fix and crash with AttributeError on this image's sglang fork. --- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 7 +++---- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 19 +++++++++++-------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++---- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 7 +++---- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 7 +++---- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 7 +++---- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 29f10cd1b..06e692e67 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e2cb204d9..e7c639c2a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -19,15 +19,18 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` is required: srtctl's DynamoConfig defaults to -# install=True (pip installs dynamo 0.8.0 from PyPI). dynamo's -# `dynamo.sglang.health_check` module imports `sgl.Engine` at top -# level, which the lmsysorg/sglang:deepseek-v4-grace-blackwell image's -# sglang fork does not expose — re-installing dynamo breaks startup -# with `AttributeError: module 'sglang' has no attribute 'Engine'`. -# Use whatever dynamo build is already baked into the container. +# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI +# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in +# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import +# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell +# image's sglang fork does not expose `sgl.Engine`, so they crash at +# import with `AttributeError: module 'sglang' has no attribute +# 'Engine'`. The DSV4-targeted tag adds `from __future__ import +# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the +# annotation lazy so the module imports cleanly. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 1c978deac..3011347db 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -14,11 +14,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ./disagg-gb200-1p1d-dep8-tep8.yaml for the -# rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e2c15c775..61e024a14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index ddd061174..7338cdaf3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -12,11 +12,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 10dd11da0..111f9e435 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -13,11 +13,10 @@ model: container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" precision: "fp4" -# `install: false` — see ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for -# the rationale (srtctl defaults to installing dynamo 0.8.0, but that -# breaks against the deepseek-v4-grace-blackwell sglang fork). +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - install: false + hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b + install: true slurm: time_limit: "8:00:00" From eb3f62c3dbf734fa5ed54d8e73a538e89453b186 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 15:40:48 -0700 Subject: [PATCH 06/56] Force deepep-mode: low_latency to work around mxfp4+DeepEP normal-dispatch bug Prefill warmup crashed in run 24941291328 with: File ".../sglang/srt/layers/quantization/mxfp4_deepseek.py", line 347 topk_output = dispatch_output.topk_output AttributeError: 'DeepEPNormalDispatchOutput' object has no attribute 'topk_output' Per sglang server_args.py, --deepep-mode defaults to 'auto', which picks 'normal' for prefill batches and 'low_latency' for decode. The mxfp4_deepseek MoE kernel only handles the low_latency dispatch output shape (which carries topk_output); the normal-dispatch output type does not, so any prefill forward (or decode warmup using forward_idle) hits the AttributeError before the worker can serve. Force deepep-mode: low_latency on every prefill + decode block that uses moe-a2a-backend: deepep. The two 1p1d-dep8-tep8 decode blocks remain TP-only (no DeepEP) and are unaffected. 
Run reference: https://github.com/SemiAnalysisAI/InferenceX/actions/runs/24941291328 --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 + .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 1 + .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 10 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 06e692e67..f6e0144c0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -89,6 +90,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e7c639c2a..4a56f1556 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -86,6 +86,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency 
moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 3011347db..c676f1618 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -70,6 +70,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -90,6 +91,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 61e024a14..e15e24d12 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 7338cdaf3..290d600ef 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -68,6 +68,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -88,6 +89,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 111f9e435..05f289815 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -69,6 +69,7 @@ backend: ep-size: 8 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -89,6 +90,7 @@ backend: ep-size: 16 enable-dp-attention: true moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 6c608dfa33451789fca8115f7c4e475b608162a2 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:02:31 -0700 Subject: [PATCH 07/56] =?UTF-8?q?Drop=20DeepEP=20/=20DP-attn=20/=20EP=20?= =?UTF-8?q?=E2=80=94=20fork-only=20mxfp4=5Fdeepseek=20bug,=20both=20dispat?= =?UTF-8?q?ch=20types=20broken?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run after the deepep-mode: low_latency change failed again. 
Logs show two distinct DeepEP-path failures: 1. Prefill scheduler crash: File '.../sglang/srt/layers/quantization/mxfp4_deepseek.py', line 347 topk_output = dispatch_output.topk_output AttributeError: 'DeepEPLLDispatchOutput' object has no attribute 'topk_output' The earlier crash had 'DeepEPNormalDispatchOutput' — neither dispatch output type in this image's sglang fork exposes topk_output, so forcing low_latency vs normal mode does not help. mxfp4_deepseek.py is a fork-only file (does not exist in upstream sgl-project/sglang), so the API mismatch can only be fixed by rebuilding the image. 2. Decode CUDA graph capture crash: RuntimeError: Failed: Assertion error /sgl-workspace/DeepEP/csrc/deep_ep.cpp:1233 'x.size(0) == topk_idx.size(0) and x.size(0) <= num_max_dispatch_tokens_per_rank' DeepEP low_latency_dispatch's per-rank token cap is exceeded by the cuda-graph-max-bs we configured. Both failures are in the DeepEP path. Per upstream sgl-project/sglang (server_args.py), moe_a2a_backend defaults to 'none', which uses all-reduce/all-gather dispatch and lets TP shard the expert weights across ranks (no separate EP needed). NVIDIA/srt-slurm PR #75 (the only upstream DSV4 sglang disagg recipe) takes the same TP-only stance — pure tensor-parallel-size: N with no enable-dp-attention, no moe-a2a-backend deepep, no dp-size, no ep-size. Drop those five fields from all 6 recipes. Topology shape preserved: - 1k1k 1p1d: P TP=8 / D TP=8 (4 nodes) - 1k1k 1p1d-wide: P TP=8 / D TP=16 (6 nodes) - 1k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes) - 8k1k 1p1d: P TP=8 / D TP=8 (4 nodes) - 8k1k 3p1d-wide: P 3*TP=8 / D TP=16 (10 nodes) - 8k1k 7p1d-wide: P 7*TP=8 / D TP=16 (18 nodes) DSV4-Pro at MXFP4 (~340 GB) shards comfortably under TP=8 (~42 GB/rank) or TP=16 (~21 GB/rank) with mem-fraction-static: 0.82 leaving plenty of KV cache headroom on each 96 GB GB200 GPU. 
Topology filenames retain the 'dep8' / 'dep16' historical names from the vLLM PR #1129 sibling for symmetry — the actual sglang_config is TP-only. --- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 10 -------- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 23 ++++++++++++------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 -------- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 5 ---- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 10 -------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 10 -------- 6 files changed, 15 insertions(+), 53 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index f6e0144c0..33f33fa92 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -65,11 +65,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -86,11 +81,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 4a56f1556..917d26dc6 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -10,9 +10,21 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- # framework numbers stay directly comparable. # -# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes. -# Targets very low concurrency (1-64) where TP-sharded decode gives the -# best per-user latency. +# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very +# low concurrency (1-64). +# +# Why TP-only (no DeepEP, no DP-attention, no EP): the +# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships +# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in +# upstream sgl-project/sglang) that reads `dispatch_output.topk_output` +# at line 347. Neither `DeepEPNormalDispatchOutput` nor +# `DeepEPLLDispatchOutput` exposes that field in this fork, so any +# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75 +# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only +# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py) +# and lets TP shard the expert weights instead of sharding via EP. +# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch +# API mismatch is fixed. 
model: path: "deepseek-v4-pro" @@ -82,11 +94,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index c676f1618..5049d6f7d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -66,11 +66,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -87,11 +82,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e15e24d12..2cf890688 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -65,11 +65,6 @@ backend: served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 290d600ef..6b4cb46ab 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -64,11 +64,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -85,11 +80,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 05f289815..fc9790730 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -65,11 +65,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 - dp-size: 
8 - ep-size: 8 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -86,11 +81,6 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 - dp-size: 16 - ep-size: 16 - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 2bb3ef073a5ae669dd4f2896947ea5c6bbbbd195 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:21:08 -0700 Subject: [PATCH 08/56] =?UTF-8?q?Add=20moe-dense-tp-size:=201=20=E2=80=94?= =?UTF-8?q?=20fix=20shared-experts=20FP8=20block-quant=20divisibility=20at?= =?UTF-8?q?=20TP=3D8/16?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the DeepEP removal, model load crashed at: File '.../sglang/srt/layers/quantization/fp8.py', line 282, in validate_block_quant_shapes raise ValueError( ValueError: Weight output_partition_size = 192 is not divisible by weight quantization block_n = 128. DSV4-Pro's shared-experts gate_up_proj (intermediate ~1536) FP8-quants in 128-element blocks. With TP=8 the per-rank slice is 1536/8=192, which fails the divisibility check. PR #75 sidesteps this by using TP=4 (1536/4=384), but that locks us into single-node workers. sglang's --moe-dense-tp-size flag is the documented workaround (server_args.py: 'useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports'). Setting moe-dense-tp-size: 1 runs the shared / dense-MLP layers replicated across ranks (TP=1) while the rest of the model — attention, routed experts — keeps TP=8/16. Memory cost is small since shared experts are a fraction of total weights. Applied to all 6 recipes; topology/node counts unchanged. 
--- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 33f33fa92..7081919fc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 917d26dc6..6c7df35e4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -94,6 +94,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 
moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -110,6 +111,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 5049d6f7d..9ddf19ee7 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -66,6 +66,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,6 +83,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 2cf890688..4112e4244 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ 
-81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6b4cb46ab..d9f43773f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -64,6 +64,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -80,6 +81,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index fc9790730..5887e85b1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -65,6 +65,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true tensor-parallel-size: 8 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,6 +82,7 @@ backend: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true 
tensor-parallel-size: 16 + moe-dense-tp-size: 1 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From d34d894ef814cc5eb584d821c4bff1cd95d10a85 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:24:04 -0700 Subject: [PATCH 09/56] Set SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 in all env blocks Belt-and-suspenders for the DeepEP per-rank dispatch buffer cap. The default is too low; with this set we'll have headroom if EP / DeepEP is re-enabled later (e.g., once the fork's mxfp4_deepseek dispatch API mismatch is fixed). 1024 matches the cookbook's B200 decode reference. --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 7081919fc..4a6397649 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 6c7df35e4..cc67a2cb6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -79,6 +79,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -88,6 +89,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 9ddf19ee7..6a4258a8a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -51,6 +51,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,6 +61,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 4112e4244..8024a769f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index d9f43773f..4d997ec99 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -49,6 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -58,6 +59,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 5887e85b1..ac26318aa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -50,6 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,6 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: From c24f25bf4772f81f4bf48529f51a8254b92c7069 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:42:07 -0700 Subject: [PATCH 10/56] =?UTF-8?q?Switch=20to=20TP=3D4=20single-node=20?= =?UTF-8?q?=E2=80=94=20match=20PR=20#75=20verbatim,=20fix=20FP8=20block-qu?= =?UTF-8?q?ant?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run after moe-dense-tp-size: 1 added still hit: ValueError: Weight output_partition_size = 192 is not divisible by weight quantization block_n = 128. Verified in upstream sglang dp_attention.py (compute_dp_attention_local_info): if not enable_dp_attention: return tp_rank, tp_size, 0 # moe_dense_tp_size IGNORED The flag is only honored when enable_dp_attention=True. Since we already dropped DP-attention to avoid the fork's mxfp4_deepseek bug, moe-dense-tp-size: 1 was a no-op. 
Two valid paths: (a) re-enable DP-attention without DeepEP — speculative, never tested (b) drop to TP=4 — 1536/4=384 divides cleanly by 128, FP8 quant passes. Matches NVIDIA/srt-slurm PR #75 (the only verified- working DSV4 sglang disagg recipe upstream) verbatim. Going with (b). Recipes drop moe-dense-tp-size (no longer needed at TP=4) and switch tensor-parallel-size to 4 in both prefill+decode. gpus_per_prefill / gpus_per_decode drop to 4 (single GB200 node per worker). prefill_nodes / decode_nodes track worker counts. Topology shape (filenames keep historical dep8/dep16 naming for symmetry with the vLLM #1129 sibling; actual config is TP=4): - 1k1k 1p1d-tep8: P TP=4 / D TP=4 (2 nodes total) - 1k1k 1p1d-dep16: P TP=4 / D TP=4 (2 nodes total) — same shape, different conc - 1k1k 3p1d-dep16: P 3*TP=4 / D TP=4 (4 nodes) - 8k1k 1p1d-tep8: P TP=4 / D TP=4 (2 nodes) - 8k1k 3p1d-dep16: P 3*TP=4 / D TP=4 (4 nodes) - 8k1k 7p1d-dep16: P 7*TP=4 / D TP=4 (8 nodes) nvidia-master.yaml updated to match (tp: 4, ep: 1, dp-attn: false on every prefill+decode block — including the commented 8k/1k block). Also bumped SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK 1024 → 2048 in all env blocks (DeepEP path is dormant in this config but the env var is in place for re-enabling later). 
--- .github/configs/nvidia-master.yaml | 94 ++++++++++--------- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++-- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++-- 7 files changed, 97 insertions(+), 105 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b2d361f65..edc142380 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7677,58 +7677,62 @@ dsv4-fp4-gb200-dynamo-sglang: multinode: true disagg: true seq-len-configs: - # 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg - # recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling - # so framework-level numbers are directly comparable. Per-worker - # tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and - # NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml - # (DSR1 sglang disagg structure). + # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75 + # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang + # disagg recipe upstream). The lmsysorg/sglang:deepseek-v4-grace- + # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug + # (does not exist in upstream sgl-project/sglang) that crashes the + # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails + # FP8 block-quant divisibility (1536/8=192, not divisible by 128). + # TP=4 (1536/4=384) clears both — see recipe headers for the full chain. + # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with + # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4. - isl: 1024 osl: 1024 search-space: - # Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 
2 nodes. - conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false - # Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes. + # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - # High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. - # 4096 overlap with the 1p1d block gives a topology-crossover A/B. + tp: 4 + ep: 1 + dp-attn: false + # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 - tp: 8 - ep: 8 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true + tp: 4 + ep: 1 + dp-attn: false # 8k/1k block kept commented out — same rationale as the dsv4-fp4- # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. @@ -7736,45 +7740,45 @@ dsv4-fp4-gb200-dynamo-sglang: # - isl: 8192 # osl: 1024 # search-space: - # # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes. + # # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. 
# - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: # num-worker: 1 - # tp: 8 + # tp: 4 # ep: 1 # dp-attn: false - # # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes. + # # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. # - conc-list: [512, 1024] # prefill: # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes. + # tp: 4 + # ep: 1 + # dp-attn: false + # # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes. # - conc-list: [4096, 8192] # prefill: # num-worker: 7 - # tp: 8 - # ep: 8 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # tp: 4 + # ep: 1 + # dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 4a6397649..2833331d1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 4 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 
frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index cc67a2cb6..8b9603422 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -54,12 +54,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -79,7 +79,7 @@ backend: 
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -89,14 +89,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -112,8 +111,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6a4258a8a..3115a0317 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -29,12 +29,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 + prefill_nodes: 3 + decode_nodes: 1 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -51,7 +51,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -61,14 +61,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -84,8 +83,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 8024a769f..dd09ba086 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 + prefill_nodes: 1 + decode_nodes: 1 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 4d997ec99..5a4bf4927 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -27,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 4 + prefill_nodes: 3 + decode_nodes: 1 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -49,7 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,14 +59,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,8 +81,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index ac26318aa..b17d5e08f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 4 + prefill_nodes: 7 + decode_nodes: 1 prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 16 + gpus_per_prefill: 4 + gpus_per_decode: 4 frontend: type: dynamo @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,14 +60,13 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,8 +82,7 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 16 - moe-dense-tp-size: 1 + tensor-parallel-size: 4 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 8316d3f1bc21c831fbb1153ebdfc0fcb87b96b32 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 16:56:02 -0700 Subject: [PATCH 11/56] Restore mi355x retry changelog entries clobbered by merge The merge of main into this branch (c0aec939) accidentally overwrote the two dsv4-fp8-mi355x-sglang retry entries (PR #1148 retry-pair tail and PR #1159 retry-pair) with duplicated copies of our own dsv4-fp4-gb200-dynamo-sglang entry. The process_changelog.py gate rejects deletions, so the workflow blocked. Restore the two mi355x entries verbatim from origin/main and keep a single copy of our dsv4 entry, appended after the restored mi355x block. perf-changelog.yaml diff vs origin/main is now additions-only. 
--- perf-changelog.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c0c907b88..5312db2fe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1820,6 +1820,21 @@ - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + +- config-keys: + - dsv4-fp8-mi355x-sglang + description: + - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh" + - "Bump --chunked-prefill-size from 4096 to 8192" + - "Retrigger dsv4-fp8-mi355x-sglang" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1159 + - config-keys: - dsv4-fp4-gb200-dynamo-sglang description: From f089567835284074bf161e40e7d1b75a373da5bf Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sat, 25 Apr 2026 17:36:53 -0700 Subject: [PATCH 12/56] Switch back to TP=8: enable-dp-attention + moe-dense-tp-size: 1, no moe-a2a-backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TP=4 OOMed — DSV4-Pro at MXFP4 doesn't fit on a single GB200 node. Need TP=8 across 2 nodes (768 GB total). But TP=8 trips two issues that earlier rounds papered over: a) shared-experts gate_up_proj FP8 block-quant divisibility (1536/8=192, not a multiple of block_n=128) b) the lmsysorg/sglang:deepseek-v4-grace-blackwell fork's mxfp4_deepseek kernel crashes on every DeepEP forward path Single combo that solves both — verified in upstream sglang source: * enable-dp-attention: true + moe-dense-tp-size: 1 Runs dense / shared-MLP layers replicated (TP=1) — fixes (a). 
moe-dense-tp-size IS gated on enable_dp_attention=True per python/sglang/srt/layers/dp_attention.py (compute_dp_attention_local_info ignores it when DP-attn is off). * NO moe-a2a-backend set (default 'none') Lands the model on forward_normal instead of forward_deepep — avoids (b). Verified in deepseek_v2.py: _enable_a2a_moe = is_deepep | is_mooncake | is_nixl | is_mori | is_ascend_fuseep | is_flashinfer With backend='none' this is False and forward_normal runs. Recipes: tensor-parallel-size 4 → 8 (both prefill+decode); add moe-dense-tp-size: 1, enable-dp-attention: true, dp-size: 8 to every sglang_config block; gpus_per_prefill / gpus_per_decode 4 → 8; prefill_nodes / decode_nodes scale to workers × 2. nvidia-master.yaml mirrors: tp 4 → 8, dp-attn false → true on every prefill+decode block (active 1k/1k + commented 8k/1k). Topology shape restored to: - 1k1k 1p1d-* : 4 nodes (was 2) - 1k1k 3p1d-* : 8 nodes (was 4) - 8k1k 1p1d-* : 4 nodes (commented) - 8k1k 3p1d-* : 8 nodes (commented) - 8k1k 7p1d-* : 16 nodes (commented) --- .github/configs/nvidia-master.yaml | 86 ++++++++++--------- .../1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 18 ++-- .../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 57 +++++++----- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 18 ++-- .../8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 18 ++-- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 18 ++-- 7 files changed, 143 insertions(+), 90 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index edc142380..272f32702 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7677,62 +7677,68 @@ dsv4-fp4-gb200-dynamo-sglang: multinode: true disagg: true seq-len-configs: - # 1k/1k — TP-only single-node workers (matches NVIDIA/srt-slurm PR #75 - # GB300 DSV4 sglang disagg, the only verified-working DSV4 sglang - # disagg recipe upstream). 
The lmsysorg/sglang:deepseek-v4-grace- - # blackwell image's sglang fork has a fork-only mxfp4_deepseek bug - # (does not exist in upstream sgl-project/sglang) that crashes the - # DeepEP path, and at TP=8 the shared-experts gate_up_proj fails - # FP8 block-quant divisibility (1536/8=192, not divisible by 128). - # TP=4 (1536/4=384) clears both — see recipe headers for the full chain. - # Filenames keep the historical 'dep8'/'dep16' tag for symmetry with - # the dsv4-fp4-gb200-dynamo-vllm sibling; the actual recipe is TP=4. + # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no + # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's + # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any + # DeepEP forward path (both DeepEPLLDispatchOutput and + # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel + # reads). At TP=8 the shared-experts gate_up_proj would also fail + # FP8 block-quant divisibility (1536/8=192, not divisible by 128) + # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated + # — and that flag is gated on `enable_dp_attention=True` in sglang + # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at + # its default `"none"` — sglang `forward_normal` path runs (verified + # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is + # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep + # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4- + # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with + # all-reduce/all-gather MoE dispatch. - isl: 1024 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
- conc-list: [1, 4, 8, 16, 32, 64] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false - # Mid throughput: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + dp-attn: true + # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - conc-list: [128, 256, 1024, 2048, 4096] prefill: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false - # High throughput: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. + dp-attn: true + # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 - dp-attn: false + dp-attn: true # 8k/1k block kept commented out — same rationale as the dsv4-fp4- # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. @@ -7740,45 +7746,45 @@ dsv4-fp4-gb200-dynamo-sglang: # - isl: 8192 # osl: 1024 # search-space: - # # Low-concurrency: 1 prefill (TP=4) + 1 decode (TP=4). 2 nodes. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. # - conc-list: [1, 4, 8, 16, 32, 64] # prefill: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false - # # Mid: 3 prefills (TP=4) + 1 decode (TP=4). 4 nodes. 
+ # dp-attn: true + # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. # - conc-list: [512, 1024] # prefill: # num-worker: 3 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false - # # Max throughput: 7 prefills (TP=4) + 1 decode (TP=4). 8 nodes. + # dp-attn: true + # # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. # - conc-list: [4096, 8192] # prefill: # num-worker: 7 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true # additional-settings: # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" # decode: # num-worker: 1 - # tp: 4 + # tp: 8 # ep: 1 - # dp-attn: false + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 2833331d1..36a70076d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: 
"deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 8b9603422..e4a530f2a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -10,21 +10,32 @@ name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" # Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- # framework numbers stay directly comparable. # -# Topology: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. Targets very -# low concurrency (1-64). +# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes. +# Targets very low concurrency (1-64). # -# Why TP-only (no DeepEP, no DP-attention, no EP): the -# lmsysorg/sglang:deepseek-v4-grace-blackwell image's sglang fork ships -# a fork-only quant kernel `mxfp4_deepseek.py` (does not exist in -# upstream sgl-project/sglang) that reads `dispatch_output.topk_output` -# at line 347. Neither `DeepEPNormalDispatchOutput` nor -# `DeepEPLLDispatchOutput` exposes that field in this fork, so any -# `forward_deepep` path in disagg crashes the prefill scheduler. PR #75 -# (the only upstream DSV4 sglang disagg recipe) takes the same TP-only -# stance — defaults to `moe_a2a_backend="none"` (sglang server_args.py) -# and lets TP shard the expert weights instead of sharding via EP. -# We can re-introduce EP/DeepEP once the fork's mxfp4_deepseek dispatch -# API mismatch is fixed. +# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"): +# 1. 
DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM. +# TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits. +# 2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork +# ships a fork-only quant kernel `mxfp4_deepseek.py` that reads +# `dispatch_output.topk_output`. Neither `DeepEPLLDispatchOutput` +# nor `DeepEPNormalDispatchOutput` exposes that field in this +# fork, so `forward_deepep` always crashes the prefill scheduler. +# We must stay off the DeepEP path. +# 3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant +# divisibility (1536/8=192, not divisible by block_n=128). +# `moe-dense-tp-size: 1` runs the dense MLP layers replicated +# (TP=1) so the divisibility check passes — but that flag is +# gated on `enable_dp_attention=True` in sglang +# `python/sglang/srt/layers/dp_attention.py` +# (`compute_dp_attention_local_info` returns the full `tp_size` +# and ignores `moe_dense_tp_size` when DP-attn is off). +# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so +# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set. +# The default `"none"` lands the MoE on `forward_normal` instead of +# `forward_deepep` — verified in deepseek_v2.py: +# `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori| +# is_ascend_fuseep|is_flashinfer` → False with default. 
model: path: "deepseek-v4-pro" @@ -54,12 +65,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -95,7 +106,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -111,7 +125,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 3115a0317..b37023e88 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -29,12 +29,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 3 - decode_nodes: 1 + prefill_nodes: 6 + decode_nodes: 2 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -67,7 +67,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: 
"flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -83,7 +86,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index dd09ba086..2d202d337 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 1 - decode_nodes: 1 + prefill_nodes: 2 + decode_nodes: 2 prefill_workers: 1 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 5a4bf4927..a901098a4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -27,12 +27,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 3 - decode_nodes: 1 + prefill_nodes: 6 + decode_nodes: 2 prefill_workers: 3 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -65,7 +65,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -81,7 +84,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index b17d5e08f..f17bd7e2f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -28,12 +28,12 @@ health_check: resources: gpu_type: "gb200" gpus_per_node: 4 - prefill_nodes: 7 - decode_nodes: 1 + prefill_nodes: 14 + decode_nodes: 2 prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 4 + 
gpus_per_prefill: 8 + gpus_per_decode: 8 frontend: type: dynamo @@ -66,7 +66,10 @@ backend: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -82,7 +85,10 @@ backend: decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" trust-remote-code: true - tensor-parallel-size: 4 + tensor-parallel-size: 8 + moe-dense-tp-size: 1 + enable-dp-attention: true + dp-size: 8 moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 5b6eb2f36274103891cad70218c3af0940fc747b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:19:45 -0700 Subject: [PATCH 13/56] =?UTF-8?q?Scope=20sweep=20to=20high-conc=20DeepEP?= =?UTF-8?q?=20only=20=E2=80=94=20temporarily=20comment=201p1d=20blocks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comment out the low-conc (1-64) and mid-conc (128-4096) search-space entries in nvidia-master.yaml so the sweep iterates only on the high- conc 3p1d-dep8-dep16 topology. Re-enable DeepEP on that one recipe to exercise the EP path: 3p1d-dep8-dep16 prefill+decode: + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency (kept enable-dp-attention + moe-dense-tp-size: 1 + tp=8 / dp=8) Master matrix label updated to ep=8 to reflect the recipe. Sibling 1p1d recipes on disk are unchanged (still TP=8 + DP-attn, no DeepEP). They are still referenced by the commented-out master entries — restore them by uncommenting. 
--- .github/configs/nvidia-master.yaml | 68 ++++++++++--------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 27 ++++++-- 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 272f32702..87a810072 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7696,48 +7696,52 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. + # Low-/mid-conc blocks temporarily commented out so the sweep + # exercises only the high-conc DeepEP topology below — uncomment + # to re-enable. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
+ # - conc-list: [128, 256, 1024, 2048, 4096] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) + # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's + # ep-size: 8 + moe-a2a-backend: deepep. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 1 + ep: 8 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true # 8k/1k block kept commented out — same rationale as the dsv4-fp4- diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index b37023e88..be872d48f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,13 +1,20 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the -# upstream-reference list. Topology mirrors the dsv4-fp4-gb200-dynamo- -# vllm sibling. +# High-concurrency 4096/8192 topology — the only block left active in +# nvidia-master.yaml right now while we iterate on the DeepEP path. +# Sibling 1p1d recipes are kept on disk but their master.yaml entries +# are temporarily commented out. # -# Topology: 3 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 10 nodes. -# Sized for conc 4096-8192 — at those concurrencies a single prefill -# worker (the 1p1d-dep8-dep16 sibling) becomes the bottleneck since the -# 1k prefill arrival rate exceeds what one DP=8 worker can sustain. 
+# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8, +# deepep-mode: low_latency). The two 1p1d siblings stay on the +# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek- +# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see +# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to +# either crash in the same way or surface new behaviour — the goal is +# to capture the failure mode under the actual disagg/EP topology. +# +# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / +# EP=8). 8 nodes. Sized for conc 4096-8192. model: path: "deepseek-v4-pro" @@ -71,6 +78,9 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -90,6 +100,9 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "deepep" + deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From b9135868d783e67c841edcff8cb64e05d5326615 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:22:43 -0700 Subject: [PATCH 14/56] tep fix + dep for high conc --- .github/configs/nvidia-master.yaml | 32 ++++++++++++++---------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 87a810072..c886172ea 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7696,23 +7696,21 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Low-/mid-conc blocks temporarily commented out so the sweep - # exercises only the high-conc DeepEP topology below — uncomment - # to re-enable. - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
- # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true + # Mid-conc block temporarily commented out — uncomment to re-enable. + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. # - conc-list: [128, 256, 1024, 2048, 4096] # prefill: From bca99eb5b539e68e36b2ed4038fc9bd9a4826190 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 09:45:19 -0700 Subject: [PATCH 15/56] sike no dpa --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c886172ea..1650385a2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7710,7 +7710,7 @@ dsv4-fp4-gb200-dynamo-sglang: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
# - conc-list: [128, 256, 1024, 2048, 4096] # prefill: From 5866658855a762dc2da9317c74e5c8f5034c676a Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 10:13:15 -0700 Subject: [PATCH 16/56] =?UTF-8?q?Cap=20SGLANG=5FDEEPEP=5FNUM=5FMAX=5FDISPA?= =?UTF-8?q?TCH=5FTOKENS=5FPER=5FRANK=20at=201024=20=E2=80=94=20sglang=20LL?= =?UTF-8?q?=20hard=20ceiling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP run (3p1d-dep8-dep16) crashed at: File '.../sglang/srt/layers/moe/token_dispatcher/deepep.py', line 325 assert self.num_max_dispatch_tokens_per_rank <= 1024 AssertionError _DeepEPDispatcherImplLowLatency enforces a hard upper bound of 1024 in low_latency mode. We had bumped the env var to 2048 to give headroom above the earlier C++ side cap (deep_ep.cpp:1233 'x.size(0) <= num_max_dispatch_tokens_per_rank'), but 2048 trips this Python-side assertion at scheduler init. 1024 is the exactly-allowed value: high enough to cover the cuda-graph-max-bs we use, low enough to satisfy the LL dispatcher constructor. Apply 2048 → 1024 across all 6 recipes (every prefill + decode env block). 
--- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 4 ++-- .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 4 ++-- .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 36a70076d..9b773b346 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index e4a530f2a..c8bcc16a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -90,7 +90,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -100,7 +100,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index be872d48f..a84417a16 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -58,7 +58,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -68,7 +68,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 2d202d337..267e69dd5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index a901098a4..0bbf14313 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -49,7 +49,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -59,7 +59,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index f17bd7e2f..436c3b4aa 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -50,7 +50,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" decode_environment: PYTHONUNBUFFERED: "1" @@ -60,7 +60,7 @@ backend: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "2048" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" sglang_config: prefill: From c0fc3bbe0d2908940343fbd6e7676359c9e51966 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 11:52:47 -0700 Subject: [PATCH 17/56] Revert 3p1d-dep8-dep16 to no-DeepEP TP-only; uncomment full 1k/1k + 8k/1k sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP is broken on the lmsysorg/sglang:deepseek-v4-grace-blackwell image — verified across three runs (deepep-mode auto/normal, deepep-mode low_latency, and the latest 3p1d try). All hit the fork-only mxfp4_deepseek.py:347 reading dispatch_output.topk_output, which neither DeepEPLLDispatchOutput nor DeepEPNormalDispatchOutput exposes in this fork. 
Cannot be fixed from the recipe — needs the image rebuilt with mxfp4_deepseek patched, or an upstream sglang fix. 3p1d-dep8-dep16 recipe: drop ep-size, moe-a2a-backend, deepep-mode from prefill+decode. Now matches the 1p1d siblings: TP=8 + DP=8 + moe-dense-tp-size: 1, default 'none' a2a backend (forward_normal path bypasses the buggy mxfp4_deepseek kernel). nvidia-master.yaml: * Uncomment the 1k/1k mid-conc and 8k/1k blocks (low + mid + high). * 3p1d-dep8-dep16 matrix label ep: 8 → ep: 1 to match recipe. Sweep now expands to 6 entries / 27 conc points (3 1k/1k + 3 8k/1k). --- .github/configs/nvidia-master.yaml | 131 +++++++++--------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 29 ++-- 2 files changed, 72 insertions(+), 88 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6123d7e6e..30491567f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7720,7 +7720,6 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Mid-conc block temporarily commented out — uncomment to re-enable. # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - conc-list: [1, 4, 8, 16, 32, 64] prefill: @@ -7735,82 +7734,78 @@ dsv4-fp4-gb200-dynamo-sglang: tp: 8 ep: 1 dp-attn: true - # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [128, 256, 1024, 2048, 4096] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) - # via DeepEP. 8 nodes. matrix label ep=8 reflects the recipe's - # ep-size: 8 + moe-a2a-backend: deepep. + # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. 
+ - conc-list: [128, 256, 1024, 2048, 4096] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 8 + ep: 1 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: true - # 8k/1k block kept commented out — same rationale as the dsv4-fp4- - # gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime bounded. - # Uncomment to re-enable (recipes are already in place). - # - isl: 8192 - # osl: 1024 - # search-space: - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. 
- # - conc-list: [4096, 8192] - # prefill: - # num-worker: 7 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + - conc-list: [512, 1024] + prefill: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true + # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. + - conc-list: [4096, 8192] + prefill: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index a84417a16..0548de9ff 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,20 +1,15 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology — the only block left active in -# nvidia-master.yaml right now while we iterate on the DeepEP path. 
-# Sibling 1p1d recipes are kept on disk but their master.yaml entries -# are temporarily commented out. +# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no +# DeepEP shape as the 1p1d siblings — see +# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint +# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need +# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none` +# moe-a2a-backend → forward_normal path bypasses the buggy kernel). +# Adds prefill capacity (3 workers vs 1) for the high-conc tail — +# single prefill saturates around conc 4096 at 1k prompts. # -# This recipe DOES enable DeepEP (moe-a2a-backend: deepep, ep-size: 8, -# deepep-mode: low_latency). The two 1p1d siblings stay on the -# `forward_normal` (none) backend. With the lmsysorg/sglang:deepseek- -# v4-grace-blackwell fork's `mxfp4_deepseek` bug still present (see -# ./disagg-gb200-1p1d-dep8-tep8.yaml header), this run is expected to -# either crash in the same way or surface new behaviour — the goal is -# to capture the failure mode under the actual disagg/EP topology. -# -# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / -# EP=8). 8 nodes. Sized for conc 4096-8192. +# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -78,9 +73,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -100,9 +92,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "deepep" - deepep-mode: low_latency moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From bc9fccf49bdaaf4c75f028ae7b58e772c618e079 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Sun, 26 Apr 2026 21:57:32 -0700 Subject: [PATCH 18/56] Try moe-a2a-backend: flashinfer on 3p1d-dep8-dep16 for high-conc EP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepEP is dead in this image (mxfp4_deepseek.py:347 reads dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output exposes that field). Smoke test the only other plausible EP backend upstream sglang offers: flashinfer. Per upstream docs/advanced_features/expert_parallelism.md, flashinfer is the documented option for 'Large-scale EP deployments' and uses a different dispatcher than DeepEP — its output class may or may not trip the same mxfp4_deepseek bug. Per server_args.py _handle_a2a_moe, flashinfer auto-sets SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we set ep-size: 8 explicitly. Everything else (TP=8 / DP=8 / moe-dense-tp-size: 1) stays so the FP8 block-quant path remains valid. Scope: 1k/1k 3p1d-dep8-dep16 only. If the EP path serves on this image, port back to the 1p1d siblings; if it crashes the same way DeepEP did, revert to the no-EP forward_normal path and accept the TP-only pareto. nvidia-master.yaml matrix labels for the 3p1d entry updated to ep=8 to match the recipe. 
--- .github/configs/nvidia-master.yaml | 9 +++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 36 ++++++++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 21ed11dd1..6123bdf6d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7748,19 +7748,22 @@ dsv4-fp4-gb200-dynamo-sglang: ep: 1 dp-attn: true # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. + # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via + # flashinfer EP smoke test (DeepEP is dead in this image — see the + # recipe header). matrix labels ep=8 reflect the recipe's + # ep-size: 8 + moe-a2a-backend: flashinfer. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 1 + ep: 8 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true - isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 0548de9ff..e86224bca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,15 +1,29 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no -# DeepEP shape as the 1p1d siblings — see -# ./disagg-gb200-1p1d-dep8-tep8.yaml header for the full constraint -# chain (mxfp4_deepseek fork-bug → no DeepEP; FP8 block-quant → need -# moe-dense-tp-size: 1; that flag → needs DP-attention; default `none` -# moe-a2a-backend → forward_normal path bypasses the buggy kernel). 
-# Adds prefill capacity (3 workers vs 1) for the high-conc tail — -# single prefill saturates around conc 4096 at 1k prompts. +# High-concurrency 4096/8192 topology — flashinfer EP smoke test. # -# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. +# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads +# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output +# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml +# header). This recipe tries `moe-a2a-backend: flashinfer` instead — +# upstream sglang docs (docs/advanced_features/expert_parallelism.md) +# call out flashinfer as the option for "Large-scale EP deployments", +# and its dispatcher returns a different output class than DeepEP, so +# the mxfp4_deepseek apply path may or may not trip the same bug. +# +# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets +# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we +# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 / +# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid. +# +# Goal here is binary: does the EP path serve any real prefill batch +# on this image, or does it crash the same way DeepEP did. If it +# serves, copy this pattern back to the 1p1d siblings; if it crashes, +# revert to the no-EP forward_normal path and accept the TP-only +# pareto. +# +# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / +# EP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -73,6 +87,8 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -92,6 +108,8 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 + ep-size: 8 + moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From e6d8943c7f883904a4ea8bca774db51e6dd572cb Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 00:09:34 -0700 Subject: [PATCH 19/56] =?UTF-8?q?Revert=20flashinfer=20EP=20attempt=20?= =?UTF-8?q?=E2=80=94=20accept=20TP-only=20pareto,=20every=20EP=20backend?= =?UTF-8?q?=20dead=20on=20this=20image?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flashinfer EP smoke test (3p1d-dep8-dep16 1k/1k) crashed at startup: File '.../sglang/srt/server_args.py', line 2133, in _handle_a2a_moe assert self.moe_runner_backend in [...] AssertionError: Flashinfer MoE A2A is only supported with flashinfer_cutlass moe runner backend flashinfer_cutlass is FP8-only — won't load DSV4-Pro's MXFP4 weights. The only path that satisfies the assertion would also fail at model load. So flashinfer is unusable for DSV4 on any image that doesn't ship a flashinfer_mxfp4_cutlass runner (which doesn't exist). Combined with the earlier deepep failure (mxfp4_deepseek.py:347 AttributeError on dispatch_output.topk_output, both Normal and LL dispatch types), every EP backend sglang exposes in this image is dead. Remaining options (mooncake, nixl-ep, mori, ascend_fuseep) are either Ascend-NPU-only or not wired into this image. Revert 3p1d-dep8-dep16 recipe to no-EP TP-only (matches the 5 sibling recipes) and master.yaml matrix labels (ep: 8 → ep: 1). 
PR description's Known Issues section updated to a 4-row table covering every EP backend tried and accepted as dead end. --- .github/configs/nvidia-master.yaml | 9 ++-- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 41 +++++++------------ 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6123bdf6d..21ed11dd1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7748,22 +7748,19 @@ dsv4-fp4-gb200-dynamo-sglang: ep: 1 dp-attn: true # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # High throughput: 3 prefills (TP=8 EP=8) + 1 decode (TP=8 EP=8) via - # flashinfer EP smoke test (DeepEP is dead in this image — see the - # recipe header). matrix labels ep=8 reflect the recipe's - # ep-size: 8 + moe-a2a-backend: flashinfer. + # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. - conc-list: [4096, 8192] prefill: num-worker: 3 tp: 8 - ep: 8 + ep: 1 dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" decode: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: true - isl: 8192 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index e86224bca..96acb25f2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,29 +1,22 @@ name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" -# High-concurrency 4096/8192 topology — flashinfer EP smoke test. +# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP +# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml +# header for the full constraint chain. 
# -# DeepEP is dead on this image (mxfp4_deepseek.py:347 reads -# dispatch_output.topk_output, neither DeepEPNormal nor DeepEPLL output -# class exposes that field — see ./disagg-gb200-1p1d-dep8-tep8.yaml -# header). This recipe tries `moe-a2a-backend: flashinfer` instead — -# upstream sglang docs (docs/advanced_features/expert_parallelism.md) -# call out flashinfer as the option for "Large-scale EP deployments", -# and its dispatcher returns a different output class than DeepEP, so -# the mxfp4_deepseek apply path may or may not trip the same bug. +# Both EP backends available upstream (deepep, flashinfer) are dead on +# this image: +# * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output; +# neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput +# exposes that field in this fork. +# * flashinfer — `_handle_a2a_moe` in server_args.py asserts +# "Flashinfer MoE A2A is only supported with flashinfer_cutlass +# moe runner backend", and flashinfer_cutlass is FP8-only — won't +# load DSV4-Pro's MXFP4 weights. +# Adds prefill capacity (3 workers vs 1) for the high-conc tail — +# single prefill saturates around conc 4096 at 1k prompts. # -# Per sglang server_args.py `_handle_a2a_moe`, flashinfer auto-sets -# SGLANG_MOE_NVFP4_DISPATCH=True and forces ep_size = tp_size, so we -# set ep-size: 8 explicitly. Keeps everything else (TP=8 / DP=8 / -# moe-dense-tp-size: 1) so the FP8 block-quant path remains valid. -# -# Goal here is binary: does the EP path serve any real prefill batch -# on this image, or does it crash the same way DeepEP did. If it -# serves, copy this pattern back to the 1p1d siblings; if it crashes, -# revert to the no-EP forward_normal path and accept the TP-only -# pareto. -# -# Topology: 3 prefill (TP=8 / DP=8 / EP=8) + 1 decode (TP=8 / DP=8 / -# EP=8). 8 nodes. +# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. 
model: path: "deepseek-v4-pro" @@ -87,8 +80,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true @@ -108,8 +99,6 @@ backend: moe-dense-tp-size: 1 enable-dp-attention: true dp-size: 8 - ep-size: 8 - moe-a2a-backend: "flashinfer" moe-runner-backend: "flashinfer_mxfp4" chunked-prefill-size: 4096 disable-flashinfer-autotune: true From 1d27533a322c3016f27d7ddf305a0380accefd6b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 14:46:45 -0700 Subject: [PATCH 20/56] fix(sglang): bump 8k1k prefill max-running-requests from 4 to 8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sglang computes per-rank capacity as max_running_requests // dp_size. With dp-size=8, a value of 4 floors to 0, hitting the "max_running_request is zero" assertion in tp_worker.py:277. Bump to 8 so each DP rank gets at least 1 slot — matches the working 1p1d recipe. 
--- .../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 +- .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 0bbf14313..291390321 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -75,7 +75,7 @@ backend: disable-radix-cache: true mem-fraction-static: 0.82 context-length: 9280 - max-running-requests: 4 + max-running-requests: 8 stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 436c3b4aa..e635de8f0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -76,7 +76,7 @@ backend: disable-radix-cache: true mem-fraction-static: 0.82 context-length: 9280 - max-running-requests: 4 + max-running-requests: 8 stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" From df1c783af91d2a2cfe4cbd74e839cc609ce37a4b Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Mon, 27 Apr 2026 21:05:54 -0700 Subject: [PATCH 21/56] ports --- .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ .../sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml | 2 ++ 
.../sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 2 ++ .../sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 2 ++ 6 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml index 9b773b346..d309562a1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml index c8bcc16a1..e20c9c0a2 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -120,6 +120,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -140,6 +141,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 96acb25f2..a8a161798 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -90,6 +90,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -110,6 +111,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml index 267e69dd5..218ad01f6 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 291390321..a1fd14571 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -79,6 +79,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -99,6 +100,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index e635de8f0..4eb0f2716 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -80,6 +80,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "prefill" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl decode: @@ -100,6 +101,7 @@ backend: stream-interval: 50 decode-log-interval: 1000 disaggregation-mode: "decode" + disaggregation-bootstrap-port: 30001 disaggregation-transfer-backend: nixl benchmark: From 513cbef2d45f095994e4e32a7322fcd919ecb7da Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:03:07 -0700 Subject: [PATCH 22/56] Dsv4 fp4 gb200 dynamo sglang disagg (#1213) * Modify deepseek-v4 configuration for new model settings * Update YAML configuration for deepseek model --- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 246 ++++++++++++------ .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 234 ++++++++++++----- 2 files changed, 325 insertions(+), 155 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index a8a161798..6dddf8204 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,123 +1,203 @@ -name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" - -# High-concurrency 4096/8192 topology. Same TP=8 + DP-attn + no-EP -# shape as the 1p1d siblings — see ./disagg-gb200-1p1d-dep8-tep8.yaml -# header for the full constraint chain. -# -# Both EP backends available upstream (deepep, flashinfer) are dead on -# this image: -# * deepep — mxfp4_deepseek.py:347 reads dispatch_output.topk_output; -# neither DeepEPNormalDispatchOutput nor DeepEPLLDispatchOutput -# exposes that field in this fork. -# * flashinfer — `_handle_a2a_moe` in server_args.py asserts -# "Flashinfer MoE A2A is only supported with flashinfer_cutlass -# moe runner backend", and flashinfer_cutlass is FP8-only — won't -# load DSV4-Pro's MXFP4 weights. -# Adds prefill capacity (3 workers vs 1) for the high-conc tail — -# single prefill saturates around conc 4096 at 1k prompts. -# -# Topology: 3 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 8 nodes. +name: "dsv4-pro-gb300-fp4" -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" +slurm: + partition: hpc-mid + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" -# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" -slurm: - time_limit: "8:00:00" +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh -health_check: - max_attempts: 1440 - interval_seconds: 10 +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" resources: - gpu_type: "gb200" + gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + # prefill_nodes / prefill_workers / decode_nodes / decode_workers are + # set per-override; not duplicated in base. -frontend: - type: dynamo - enable_multiple_frontends: false +extra_mount: + - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" + - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + +# setup_script: "install_sglang.sh" backend: type: sglang prefill_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + 
MC_FORCE_MNNVL: "1" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" decode_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + + # Parallel + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 16 - stream-interval: 50 - decode-log-interval: 1000 + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + # disable-radix-cache: true # NOTE try to enable radix cache decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 1024 - cuda-graph-max-bs: 1024 - stream-interval: 50 - decode-log-interval: 1000 + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + # disable-radix-cache: true # NOTE try to enable radix cache + disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl 
- -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false + disaggregation-transfer-backend: mooncake + + # tensor-parallel-size / data-parallel-size / expert-parallel-size + # / max-running-requests / cuda-graph-max-bs are set per-override. + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + + benchmark: + type: custom + command: | + set -e + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + python3 benchmark_serving.py \ + --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ + --num-warmups 512 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results.json + # concurrencies set per-override + +############ 1k1k ############## +# [0]is wideep, [1] is narrow ep +zip_override_1k1k_hightpt: + resources: + prefill_nodes: [7, 1] + prefill_workers: [7, 1] + decode_nodes: [2, 2] + decode_workers: [1, 1] + backend: + sglang_config: + decode: + tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + + enable-dp-attention: true + enable-dp-lm-head: true + + # ep-num-redundant-experts + ep-dispatch-algorithm intentionally + # removed: no static dispatching file available yet. 
+ + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu + cuda-graph-max-bs: [1152, 32] + + # benchmark: + # isl: 1024 + # osl: 1024 + # concurrencies: "16384" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index 4eb0f2716..dacb0f9bd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -1,113 +1,203 @@ -name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" +name: "dsv4-pro-gb300-fp4" -# 8k/1k max-throughput topology: 7 prefill (DP=8 EP=8) + 1 wide decode -# (DP=16 EP=16). 18 nodes — full GB200 cluster. Targets conc 4096-8192. -# Per-worker tunings identical to the 3p1d sibling; only prefill_workers -# and prefill_nodes scale up. -# -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference -# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. +slurm: + partition: hpc-mid + time_limit: "03:00:00" -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" +sbatch_directives: + cpus-per-task: "144" + mem: "0" -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" -slurm: - time_limit: "8:00:00" +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh -health_check: - max_attempts: 1440 - interval_seconds: 10 +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" resources: - gpu_type: "gb200" + gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 14 - decode_nodes: 2 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 + # prefill_nodes / prefill_workers / decode_nodes / decode_workers are + # set per-override; not duplicated in base. -frontend: - type: dynamo - enable_multiple_frontends: false +extra_mount: + - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" + - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + +# setup_script: "install_sglang.sh" backend: type: sglang prefill_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + 
MC_FORCE_MNNVL: "1" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" decode_environment: + # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + + # Parallel + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 8 - stream-interval: 50 - decode-log-interval: 1000 + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + # disable-radix-cache: true # NOTE try to enable radix cache decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 256 - cuda-graph-max-bs: 256 - stream-interval: 50 - decode-log-interval: 1000 + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 # pr50 sets it, let's do it + # tokenizer-worker-num: 16 # need this if we run tokenizer + # disable-radix-cache: true # NOTE try to enable radix cache + disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - 
-benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false + disaggregation-transfer-backend: mooncake + + # tensor-parallel-size / data-parallel-size / expert-parallel-size + # / max-running-requests / cuda-graph-max-bs are set per-override. + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + + benchmark: + type: custom + command: | + set -e + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ + --num-warmups 512 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results.json + # concurrencies set per-override + +############ 8k1k ############## +# [0]is wideep, [1] is narrow ep +zip_override_8k1k_hightpt: + resources: + prefill_nodes: [7, 1] + prefill_workers: [7, 1] + decode_nodes: [2, 2] + decode_workers: [1, 1] + backend: + sglang_config: + decode: + tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu + + enable-dp-attention: true + enable-dp-lm-head: true + + # ep-num-redundant-experts + ep-dispatch-algorithm intentionally + # removed: no static dispatching file available yet. 
+ + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu + cuda-graph-max-bs: [1152, 32] + + # benchmark: + # isl: 8192 + # osl: 1024 + # concurrencies: "16384" From b27c8da37878535c5a1d9e092be8140d360885b5 Mon Sep 17 00:00:00 2001 From: Oseltamivir Date: Tue, 28 Apr 2026 12:36:51 -0700 Subject: [PATCH 23/56] adapt for model path, etc --- .github/configs/nvidia-master.yaml | 127 +++++++------- .../1k1k/disagg-gb200-3p1d-dep8-dep16.yaml | 162 ++++++++--------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 163 ++++++++---------- 3 files changed, 207 insertions(+), 245 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0b43c4549..1c85aeab2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7746,36 +7746,40 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [128, 256, 1024, 2048, 4096] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # High throughput: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # 4096 overlap with the 1p1d block gives a prefill-scaling A/B. 
+ # Low/mid-concurrency entries (1p1d-dep8-tep8 and 1p1d-dep8-dep16 + # recipes) commented out: PR #1213 only refreshed the 3p1d-dep8-dep16 + # high-throughput recipe; the 1p1d siblings still match the older + # operational shape and are out of scope for the PR #1213 sweep. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + # - conc-list: [128, 256, 1024, 2048, 4096] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # High throughput: 3 prefills (TP=4 / DP=4 / EP=4) + 1 decode + # (TP=8 / DP=8 / EP=8 wideep). 5 nodes. Refreshed by PR #1213. - conc-list: [4096, 8192] prefill: num-worker: 3 @@ -7793,35 +7797,40 @@ dsv4-fp4-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - - conc-list: [1, 4, 8, 16, 32, 64] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - - conc-list: [512, 1024] - prefill: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: true - # Max throughput: 7 prefills (TP=8) + 1 decode (TP=8). 16 nodes. 
+ # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16 + # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16 + # max-throughput recipe; the 1p1d/3p1d siblings still match the older + # operational shape and are out of scope for the PR #1213 sweep. + # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. + # - conc-list: [1, 4, 8, 16, 32, 64] + # prefill: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. + # - conc-list: [512, 1024] + # prefill: + # num-worker: 3 + # tp: 8 + # ep: 1 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 1 + # dp-attn: true + # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode + # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213. - conc-list: [4096, 8192] prefill: num-worker: 7 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml index 6dddf8204..ced4e1e5b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml @@ -1,45 +1,63 @@ -name: "dsv4-pro-gb300-fp4" +name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" + +# 1k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. 
Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 +# (matrix runs on gb200-nv runners, not gb300), container & model.path +# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml +# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), +# slurm.partition + sbatch_directives + extra_mount + nginx_container +# dropped (they reference paths/partitions that exist only on the PR +# author's gb300 cluster). -slurm: - partition: hpc-mid - time_limit: "03:00:00" - -sbatch_directives: - cpus-per-task: "144" - mem: "0" +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" +# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. +# Hash bumped from PR #1213 to track the dynamo-sglang dsv4 dev branch. dynamo: hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 8 - nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh +slurm: + time_limit: "8:00:00" -model: - path: "dsv4-pro" - container: "dsv4-grace-blackwell" - precision: "fp4" +health_check: + max_attempts: 1440 + interval_seconds: 10 +# Topology: 3 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 5 nodes total. resources: - gpu_type: "gb300" + gpu_type: "gb200" gpus_per_node: 4 - # prefill_nodes / prefill_workers / decode_nodes / decode_workers are - # set per-override; not duplicated in base. 
- -extra_mount: - - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" - - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + prefill_nodes: 3 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 -# setup_script: "install_sglang.sh" +frontend: + type: dynamo + enable_multiple_frontends: false backend: type: sglang prefill_environment: - # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" SGLANG_ENABLE_THINKING: "1" @@ -70,8 +88,7 @@ backend: SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" decode_environment: - # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" SGLANG_ENABLE_THINKING: "1" @@ -105,14 +122,11 @@ backend: sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 # pr50 sets it, let's do it - # tokenizer-worker-num: 16 # need this if we run tokenizer + stream-interval: 30 - # Parallel tensor-parallel-size: 4 data-parallel-size: 4 expert-parallel-size: 4 @@ -128,76 +142,38 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 chunked-prefill-size: 32768 - # disable-radix-cache: true # NOTE try to enable radix cache decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 # pr50 sets it, let's do it - # tokenizer-worker-num: 16 # need this if we run tokenizer - # disable-radix-cache: true # NOTE try to enable radix cache + stream-interval: 30 + + # Wideep 
decode shape (zip_override [0] from PR #1213, inlined). + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake - # tensor-parallel-size / data-parallel-size / expert-parallel-size - # / max-running-requests / cuda-graph-max-bs are set per-override. - mem-fraction-static: 0.94 swa-full-tokens-ratio: 0.15 context-length: 16384 - - benchmark: - type: custom - command: | - set -e - REPO=/configs/upstream-sa-bench/InferenceX - [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" - cd "$REPO/utils/bench_serving" - python3 benchmark_serving.py \ - --backend sglang --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ - --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ - --dataset-name random \ - --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ - --random-num-workers 96 \ - --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ - --num-warmups 512 \ - --ignore-eos --trust-remote-code \ - --percentile-metrics ttft,tpot,itl,e2el \ - --save-result --result-dir /logs --result-filename results.json - # concurrencies set per-override - -############ 1k1k ############## -# [0]is wideep, [1] is narrow ep -zip_override_1k1k_hightpt: - resources: - prefill_nodes: [7, 1] - prefill_workers: [7, 1] - decode_nodes: [2, 2] - decode_workers: [1, 1] - backend: - sglang_config: - decode: - tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - - enable-dp-attention: true - enable-dp-lm-head: true - - # ep-num-redundant-experts + ep-dispatch-algorithm intentionally - # removed: no static dispatching file available yet. 
- - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu - cuda-graph-max-bs: [1152, 32] - - # benchmark: - # isl: 1024 - # osl: 1024 - # concurrencies: "16384" + max-running-requests: 9216 + cuda-graph-max-bs: 1152 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml index dacb0f9bd..3a72d70f8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml @@ -1,45 +1,64 @@ -name: "dsv4-pro-gb300-fp4" +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. 
+# +# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 +# (matrix runs on gb200-nv runners, not gb300), container & model.path +# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml +# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), +# slurm.partition + sbatch_directives + extra_mount + nginx_container +# dropped (they reference paths/partitions that exist only on the PR +# author's gb300 cluster). -slurm: - partition: hpc-mid - time_limit: "03:00:00" - -sbatch_directives: - cpus-per-task: "144" - mem: "0" +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. dynamo: hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 8 - nginx_container: /mnt/home/yangminl/containers/nginx-1.27.4.sqsh +slurm: + time_limit: "8:00:00" -model: - path: "dsv4-pro" - container: "dsv4-grace-blackwell" - precision: "fp4" +health_check: + max_attempts: 1440 + interval_seconds: 10 +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. resources: - gpu_type: "gb300" + gpu_type: "gb200" gpus_per_node: 4 - # prefill_nodes / prefill_workers / decode_nodes / decode_workers are - # set per-override; not duplicated in base. 
- -extra_mount: - - "/mnt/home/yangminl/sglang-patched/sglang:/sgl-workspace/sglang" - - "/mnt/home/yangminl/sglang-patched/sglang:/workspace/sglang" + prefill_nodes: 7 + decode_nodes: 2 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 -# setup_script: "install_sglang.sh" +frontend: + type: dynamo + enable_multiple_frontends: false backend: type: sglang prefill_environment: - # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" SGLANG_ENABLE_THINKING: "1" @@ -70,8 +89,7 @@ backend: SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" decode_environment: - # SGLANG_HACK_PRINT_REQ_LIFECYCLE: "1" # TODO temp debug - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" # NOTE hack for quick tests + SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" PYTHONUNBUFFERED: "1" SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" SGLANG_ENABLE_THINKING: "1" @@ -105,14 +123,11 @@ backend: sglang_config: prefill: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 # pr50 sets it, let's do it - # tokenizer-worker-num: 16 # need this if we run tokenizer + stream-interval: 30 - # Parallel tensor-parallel-size: 4 data-parallel-size: 4 expert-parallel-size: 4 @@ -128,76 +143,38 @@ backend: max-running-requests: 512 cuda-graph-max-bs: 512 chunked-prefill-size: 32768 - # disable-radix-cache: true # NOTE try to enable radix cache decode: served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 # pr50 sets it, let's do it - # tokenizer-worker-num: 16 # need this if we run tokenizer - # disable-radix-cache: true # NOTE try to enable radix cache + stream-interval: 30 + + # Wideep 
decode shape (zip_override [0] from PR #1213, inlined). + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake - # tensor-parallel-size / data-parallel-size / expert-parallel-size - # / max-running-requests / cuda-graph-max-bs are set per-override. - mem-fraction-static: 0.94 swa-full-tokens-ratio: 0.15 context-length: 16384 - - benchmark: - type: custom - command: | - set -e - REPO=/configs/upstream-sa-bench/InferenceX - [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" - cd "$REPO/utils/bench_serving" - python3 benchmark_serving.py \ - --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ - --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ - --dataset-name random \ - --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \ - --random-num-workers 96 \ - --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ - --num-warmups 512 \ - --ignore-eos --trust-remote-code \ - --percentile-metrics ttft,tpot,itl,e2el \ - --save-result --result-dir /logs --result-filename results.json - # concurrencies set per-override - -############ 8k1k ############## -# [0]is wideep, [1] is narrow ep -zip_override_8k1k_hightpt: - resources: - prefill_nodes: [7, 1] - prefill_workers: [7, 1] - decode_nodes: [2, 2] - decode_workers: [1, 1] - backend: - sglang_config: - decode: - tensor-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - data-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - expert-parallel-size: [8, 8] # NOTE change from 16gpu to 8gpu - - enable-dp-attention: true - enable-dp-lm-head: true - - # ep-num-redundant-experts + ep-dispatch-algorithm intentionally - # removed: no static dispatching file available yet. 
- - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - max-running-requests: [9216, 256] # NOTE change from 16gpu to 8gpu - cuda-graph-max-bs: [1152, 32] - - # benchmark: - # isl: 8192 - # osl: 1024 - # concurrencies: "16384" + max-running-requests: 9216 + cuda-graph-max-bs: 1152 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096x8192" + req_rate: "inf" + use_chat_template: false From 0dbc9a484bdcb0ee27d51b280fe54157b9526889 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 13:26:03 -0700 Subject: [PATCH 24/56] dev --- .../{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-dep16.yaml | 0 .../{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-tep8.yaml | 0 .../{1k1k => 1k1k-stale}/disagg-gb200-3p1d-dep8-dep16.yaml | 0 ...1p1d-dep8-tep8.yaml => stale-disagg-gb200-1p1d-dep8-tep8.yaml} | 0 ...1d-dep8-dep16.yaml => stale-disagg-gb200-3p1d-dep8-dep16.yaml} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-1p1d-dep8-tep8.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/{1k1k => 1k1k-stale}/disagg-gb200-3p1d-dep8-dep16.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb200-1p1d-dep8-tep8.yaml => stale-disagg-gb200-1p1d-dep8-tep8.yaml} (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb200-3p1d-dep8-dep16.yaml => stale-disagg-gb200-3p1d-dep8-dep16.yaml} (100%) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml similarity index 100% rename from 
benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml From ba72558eca41d413129b347140cbb17644996320 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 13:44:10 -0700 Subject: [PATCH 25/56] upd --- .../8k1k/disagg-gb300-2p1d-dep4-dep8.yaml | 178 ++++++++++++++++++ .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 178 ++++++++++++++++++ 2 files changed, 356 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml new file mode 100644 index 000000000..bceffd528 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml @@ -0,0 +1,178 @@ +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. 
+# +# Other adjustments back to the InferenceX cluster shape (gpu_type is kept +# at gb300, matching `resources.gpu_type` below): container & model.path +# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml +# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), +# slurm.partition + sbatch_directives + extra_mount + nginx_container +# dropped (they reference paths/partitions that exist only on the PR +# author's gb300 cluster). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 4 nodes total.
+resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + 
NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 + + # Wideep decode shape (zip_override [0] from PR #1213, inlined). 
+ tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + max-running-requests: 9216 + cuda-graph-max-bs: 1152 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml new file mode 100644 index 000000000..731adeb13 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -0,0 +1,178 @@ +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. 
+# +# Other adjustments back to the InferenceX cluster shape (gpu_type is kept +# at gb300, matching `resources.gpu_type` below): container & model.path +# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml +# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), +# slurm.partition + sbatch_directives + extra_mount + nginx_container +# dropped (they reference paths/partitions that exist only on the PR +# author's gb300 cluster). + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "8:00:00" + +health_check: + max_attempts: 1440 + interval_seconds: 10 + +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total.
+resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + decode_nodes: 2 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + 
NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 30 + + # Wideep decode shape (zip_override [0] from PR #1213, inlined). 
+ tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + max-running-requests: 9216 + cuda-graph-max-bs: 1152 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8192" + req_rate: "inf" + use_chat_template: false From 7c81fe95d0cff00d439cbc2550dc867614bf9216 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 14:25:25 -0700 Subject: [PATCH 26/56] fix --- .github/configs/nvidia-master.yaml | 105 ++-------- .../disagg-gb200-1p1d-dep8-dep16.yaml | 113 ----------- .../disagg-gb200-1p1d-dep8-tep8.yaml | 153 --------------- .../disagg-gb200-3p1d-dep8-dep16.yaml | 179 ----------------- .../8k1k/disagg-gb200-7p1d-dep8-dep16.yaml | 180 ------------------ .../stale-disagg-gb200-1p1d-dep8-tep8.yaml | 113 ----------- .../stale-disagg-gb200-3p1d-dep8-dep16.yaml | 112 ----------- 7 files changed, 19 insertions(+), 936 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml diff --git a/.github/configs/nvidia-master.yaml 
b/.github/configs/nvidia-master.yaml index 1c85aeab2..aff5524b3 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7717,38 +7717,22 @@ dsv4-fp4-gb200-dynamo-vllm: ep: 16 dp-attn: true -dsv4-fp4-gb200-dynamo-sglang: +dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb200 + runner: gb300 precision: fp4 framework: dynamo-sglang multinode: true disagg: true seq-len-configs: - # 1k/1k — TP=8 (2 GB200 nodes per worker) with DP-attention but no - # DeepEP. The lmsysorg/sglang:deepseek-v4-grace-blackwell image's - # sglang fork has a fork-only mxfp4_deepseek kernel that crashes any - # DeepEP forward path (both DeepEPLLDispatchOutput and - # DeepEPNormalDispatchOutput lack the `topk_output` field the kernel - # reads). At TP=8 the shared-experts gate_up_proj would also fail - # FP8 block-quant divisibility (1536/8=192, not divisible by 128) - # unless `moe-dense-tp-size: 1` runs the dense MLP layers replicated - # — and that flag is gated on `enable_dp_attention=True` in sglang - # dp_attention.py. So: DP-attention on; `moe-a2a-backend` left at - # its default `"none"` — sglang `forward_normal` path runs (verified - # in deepseek_v2.py: `_enable_a2a_moe` is False unless backend is - # deepep|mooncake|nixl|mori|ascend_fuseep|flashinfer). Filenames keep - # the historical 'dep8'/'dep16' tag for symmetry with the dsv4-fp4- - # gb200-dynamo-vllm sibling; the actual recipe is TP=8 + DP=8 with - # all-reduce/all-gather MoE dispatch. 
- - isl: 1024 + - isl: 8192 osl: 1024 search-space: - # Low/mid-concurrency entries (1p1d-dep8-tep8 and 1p1d-dep8-dep16 - # recipes) commented out: PR #1213 only refreshed the 3p1d-dep8-dep16 - # high-throughput recipe; the 1p1d siblings still match the older + # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16 + # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16 + # max-throughput recipe; the 1p1d/3p1d siblings still match the older # operational shape and are out of scope for the PR #1213 sweep. # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. # - conc-list: [1, 4, 8, 16, 32, 64] @@ -7758,89 +7742,38 @@ dsv4-fp4-gb200-dynamo-sglang: # ep: 1 # dp-attn: true # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Mid throughput: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [128, 256, 1024, 2048, 4096] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml" + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" # decode: # num-worker: 1 # tp: 8 # ep: 1 # dp-attn: true - # High throughput: 3 prefills (TP=4 / DP=4 / EP=4) + 1 decode - # (TP=8 / DP=8 / EP=8 wideep). 5 nodes. Refreshed by PR #1213. - - conc-list: [4096, 8192] + # Mid: 2 prefills (TP=4 / DP=4 / EP=4) + 1 decode (TP=8 / DP=8 / EP=8). 4 nodes.
+ - conc-list: [64] prefill: - num-worker: 3 - tp: 8 - ep: 1 + num-worker: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true - - - isl: 8192 - osl: 1024 - search-space: - # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16 - # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16 - # max-throughput recipe; the 1p1d/3p1d siblings still match the older - # operational shape and are out of scope for the PR #1213 sweep. - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # - conc-list: [512, 1024] - # prefill: - # num-worker: 3 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213.
- - conc-list: [4096, 8192] + - conc-list: [8192] prefill: num-worker: 7 - tp: 8 - ep: 1 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml" decode: num-worker: 1 tp: 8 - ep: 1 + ep: 8 dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml deleted file mode 100644 index d309562a1..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-dep16.yaml +++ /dev/null @@ -1,113 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16" - -# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the -# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg). -# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. -# -# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes. -# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank -# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing). - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
-dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 16 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - 
disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 512 - cuda-graph-max-bs: 512 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "128x256x1024x2048x4096" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml deleted file mode 100644 index e20c9c0a2..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,153 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" - -# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The -# closest references on NVIDIA/srt-slurm are: -# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) — -# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars. -# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) — -# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 + -# chunked-prefill-size=4096 + disable-flashinfer-autotune. -# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross- -# framework numbers stay directly comparable. -# -# Topology: 1 prefill (TP=8 / DP=8) + 1 decode (TP=8 / DP=8). 4 nodes. -# Targets very low concurrency (1-64). -# -# Why TP=8 + DP-attention but NO `moe-a2a-backend` (default "none"): -# 1. DSV4-Pro at MXFP4 is too large for TP=4 single-node — OOM. -# TP=8 across 2 GB200 nodes (8 GPUs * 96 GB = 768 GB) fits. -# 2. The lmsysorg/sglang:deepseek-v4-grace-blackwell sglang fork -# ships a fork-only quant kernel `mxfp4_deepseek.py` that reads -# `dispatch_output.topk_output`. 
Neither `DeepEPLLDispatchOutput` -# nor `DeepEPNormalDispatchOutput` exposes that field in this -# fork, so `forward_deepep` always crashes the prefill scheduler. -# We must stay off the DeepEP path. -# 3. At TP=8 the shared-experts gate_up_proj fails FP8 block-quant -# divisibility (1536/8=192, not divisible by block_n=128). -# `moe-dense-tp-size: 1` runs the dense MLP layers replicated -# (TP=1) so the divisibility check passes — but that flag is -# gated on `enable_dp_attention=True` in sglang -# `python/sglang/srt/layers/dp_attention.py` -# (`compute_dp_attention_local_info` returns the full `tp_size` -# and ignores `moe_dense_tp_size` when DP-attn is off). -# So: `enable-dp-attention: true` + `dp-size: 8` (DP-attn active so -# `moe-dense-tp-size: 1` takes effect) AND no `moe-a2a-backend` set. -# The default `"none"` lands the MoE on `forward_normal` instead of -# `forward_deepep` — verified in deepseek_v2.py: -# `_enable_a2a_moe = is_deepep|is_mooncake|is_nixl|is_mori| -# is_ascend_fuseep|is_flashinfer` → False with default. - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# Pin dynamo to the v1.2.0-sglang-deepseek-v4-dev.1 tag. The PyPI -# 0.8.0/0.8.1 releases (srtctl's default) reference `sgl.Engine` in -# `dynamo.sglang.health_check` *eagerly* (no `from __future__ import -# annotations`), and the lmsysorg/sglang:deepseek-v4-grace-blackwell -# image's sglang fork does not expose `sgl.Engine`, so they crash at -# import with `AttributeError: module 'sglang' has no attribute -# 'Engine'`. The DSV4-targeted tag adds `from __future__ import -# annotations` (commit cdb7218a, ai-dynamo PR #7255), making the -# annotation lazy so the module imports cleanly. 
-dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - # Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline - # that's actually been run upstream) plus the disaggregation timeout - # triple — heartbeat 100k matches the DSR1 sglang disagg convention. - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 16 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - 
trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 3072 - max-running-requests: 64 - cuda-graph-max-bs: 64 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index ced4e1e5b..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k-stale/disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,179 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" - -# 1k/1k high-throughput topology for the wideep DSV4-Pro setup. -# -# Schema/values come from PR #1213 (513cbef) — that PR introduced the -# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` -# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our -# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't -# support either: `zip_override_*_hightpt` rejects with `Unknown field` -# and `benchmark` only validates at top level. So this file inlines the -# wideep [0] override and lifts `benchmark` back out — same operational -# values, schema the pinned srtctl will accept. 
-# -# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 -# (matrix runs on gb200-nv runners, not gb300), container & model.path -# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml -# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), -# slurm.partition + sbatch_directives + extra_mount + nginx_container -# dropped (they reference paths/partitions that exist only on the PR -# author's gb300 cluster). - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ./disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. -# Hash bumped from PR #1213 to track the dynamo-sglang dsv4 dev branch. -dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -# Topology: 3 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode -# (TP=8 / DP=8 / EP=8 / 2 nodes). 5 nodes total. 
-resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 3 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_LOG_FORWARD_ITERS: "1" - SGLANG_LOG_MS: "1" - SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - - decode_environment: - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: 
"1152" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_LOG_FORWARD_ITERS: "1" - SGLANG_LOG_MS: "1" - SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - watchdog-timeout: 86400 - skip-tokenizer-init: true - stream-interval: 30 - - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - - mem-fraction-static: 0.90 - max-running-requests: 512 - cuda-graph-max-bs: 512 - chunked-prefill-size: 32768 - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - watchdog-timeout: 86400 - skip-tokenizer-init: true - stream-interval: 30 - - # Wideep decode shape (zip_override [0] from PR #1213, inlined). 
- tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - - enable-dp-attention: true - enable-dp-lm-head: true - - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - - mem-fraction-static: 0.94 - swa-full-tokens-ratio: 0.15 - context-length: 16384 - max-running-requests: 9216 - cuda-graph-max-bs: 1152 - -benchmark: - type: "sa-bench" - isl: 1024 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml deleted file mode 100644 index 3a72d70f8..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml +++ /dev/null @@ -1,180 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" - -# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. -# -# Schema/values come from PR #1213 (513cbef) — that PR introduced the -# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` -# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our -# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't -# support either: `zip_override_*_hightpt` rejects with `Unknown field` -# and `benchmark` only validates at top level. So this file inlines the -# wideep [0] override and lifts `benchmark` back out — same operational -# values, schema the pinned srtctl will accept. 
-# -# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 -# (matrix runs on gb200-nv runners, not gb300), container & model.path -# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml -# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), -# slurm.partition + sbatch_directives + extra_mount + nginx_container -# dropped (they reference paths/partitions that exist only on the PR -# author's gb300 cluster). - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin -# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 -# dev branch. -dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode -# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. 
-resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 7 - decode_nodes: 2 - prefill_workers: 7 - decode_workers: 1 - gpus_per_prefill: 4 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_LOG_FORWARD_ITERS: "1" - SGLANG_LOG_MS: "1" - SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - - decode_environment: - SGLANG_DG_CACHE_DIR: "/configs/deepgemm_cache" - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" - SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: 
"1152" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - DYN_SKIP_SGLANG_LOG_FORMATTING: "1" - SGLANG_LOG_FORWARD_ITERS: "1" - SGLANG_LOG_MS: "1" - SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - watchdog-timeout: 86400 - skip-tokenizer-init: true - stream-interval: 30 - - tensor-parallel-size: 4 - data-parallel-size: 4 - expert-parallel-size: 4 - - enable-dp-attention: true - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - - mem-fraction-static: 0.90 - max-running-requests: 512 - cuda-graph-max-bs: 512 - chunked-prefill-size: 32768 - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - watchdog-timeout: 86400 - skip-tokenizer-init: true - stream-interval: 30 - - # Wideep decode shape (zip_override [0] from PR #1213, inlined). 
- tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - - enable-dp-attention: true - enable-dp-lm-head: true - - moe-a2a-backend: "deepep" - deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - - mem-fraction-static: 0.94 - swa-full-tokens-ratio: 0.15 - context-length: 16384 - max-running-requests: 9216 - cuda-graph-max-bs: 1152 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "4096x8192" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml deleted file mode 100644 index 218ad01f6..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-1p1d-dep8-tep8.yaml +++ /dev/null @@ -1,113 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8" - -# 8k/1k variant of the 1k/1k 1p1d-dep8-tep8 recipe. Same topology and -# tuning; only context-length grows from 3072 (1k+1k+pad) to 9280 -# (8k+1k+pad), and prefill max-running-requests halves to keep the per- -# rank prefill working set inside the GPU memory budget. -# -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the full upstream- -# reference list (PR #69 GB200 agg, PR #75 GB300 disagg). - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
-dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 1 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 8 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - 
disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 64 - cuda-graph-max-bs: 64 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "1x4x8x16x32x64" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml deleted file mode 100644 index a1fd14571..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/stale-disagg-gb200-3p1d-dep8-dep16.yaml +++ /dev/null @@ -1,112 +0,0 @@ -name: "dsv4-sglang-disagg-gb200-3p1d-dep8-dep16" - -# 8k/1k mid-throughput topology: 3 prefill (DP=8 EP=8) + 1 wide decode -# (DP=16 EP=16). 10 nodes. Targets conc 512-1024 — 8k prompts saturate -# a single prefill worker below conc=512. -# -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the upstream-reference -# list. Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling. - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin rationale. 
-dynamo: - hash: 21f135f5edf40e12e6ff5db2b462d862a6d6ab9b - install: true - -slurm: - time_limit: "8:00:00" - -health_check: - max_attempts: 1440 - interval_seconds: 10 - -resources: - gpu_type: "gb200" - gpus_per_node: 4 - prefill_nodes: 6 - decode_nodes: 2 - prefill_workers: 3 - decode_workers: 1 - gpus_per_prefill: 8 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: false - -backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" - - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 8 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "prefill" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - trust-remote-code: true - tensor-parallel-size: 8 - moe-dense-tp-size: 1 - enable-dp-attention: true - dp-size: 8 - moe-runner-backend: "flashinfer_mxfp4" - chunked-prefill-size: 4096 - disable-flashinfer-autotune: true - 
disable-radix-cache: true - mem-fraction-static: 0.82 - context-length: 9280 - max-running-requests: 256 - cuda-graph-max-bs: 256 - stream-interval: 50 - decode-log-interval: 1000 - disaggregation-mode: "decode" - disaggregation-bootstrap-port: 30001 - disaggregation-transfer-backend: nixl - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "512x1024" - req_rate: "inf" - use_chat_template: false From 7a1daaf4f5d0c74dde0d0552c422eab0048f222d Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 14:26:40 -0700 Subject: [PATCH 27/56] fix --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index aff5524b3..0166b3a60 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7721,7 +7721,7 @@ dsv4-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:deepseek-v4-grace-blackwell model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: gb300 + runner: gb300-cw precision: fp4 framework: dynamo-sglang multinode: true From c454ad3e919122b2a8c11aeb9397ec1e469b814a Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 15:17:35 -0700 Subject: [PATCH 28/56] test --- perf-changelog.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a8a8bab49..1a4f0b78b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1948,12 +1948,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1202 - config-keys: - - dsv4-fp4-gb200-dynamo-sglang + - dsv4-fp4-gb300-dynamo-sglang description: - - "Add DeepSeek-V4-Pro FP4 GB200 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" + - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (1k/1k sweep; 8k/1k recipes shipped but commented out)" - "Container: 
lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe)" - - "Topologies mirror the dsv4-fp4-gb200-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" - - "No upstream GB200 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB200 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" + - "Topologies mirror the dsv4-fp4-gb300-dynamo-vllm sibling: low-conc 1p1d-dep8-tep8 (4 nodes), mid 1p1d-dep8-dep16 (6 nodes), high 3p1d-dep8-dep16 (10 nodes). 4096 overlap between mid and high gives a topology-crossover A/B" + - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (env vars + flashinfer_mxfp4 + chunked-prefill-size 4096 + disable-flashinfer-autotune + mem-fraction-static 0.82) is mirrored from NVIDIA/srt-slurm PR #69 (recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml — GB300 DSV4 SGLang aggregated). Disagg flag set (nixl transfer backend, enable-dp-attention + moe-a2a-backend deepep) cross-checked against PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml — GB300 DSV4 SGLang disagg) and the SGLang DeepSeek-V4 cookbook. 
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 - config-keys: From bac301d9ff58255821471dac3a00c20359a059ea Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 15:21:54 -0700 Subject: [PATCH 29/56] add gb300 --- runners/launch_gb300-cw.sh | 278 +++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 runners/launch_gb300-cw.sh diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100644 index 000000000..1b2d27939 --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,278 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang +# recipes are copied exactly from the pinned srt-slurm commit below. + +set -x + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. The exact upstream recipes refer to this model as + # `dspro`. + export MODEL_PATH="/mnt/vast/models/dsv4/" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. `benchmark` +# (inherited from gb200-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. 
cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" +SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. +SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + +enroot import -o $SQUASH_FILE docker://$IMAGE +enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout "$SRT_SLURM_RECIPES_COMMIT" + +# Overlay the local copy of the exact pinned recipes. 
This keeps the PR +# self-contained while preserving byte-for-byte recipe content from +# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT. +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is +# shared NFS across both. srtctl's slurm template (job_script_minimal.j2) +# does `if ! command -v uv` and skips its own ARM64 install when uv is +# already on PATH; on compute nodes $HOME/.local/bin is on PATH by +# default, so a stray x86 binary at $HOME/.local/bin/uv from this +# runner shadows the template's install and crashes the orchestrator +# with `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." 
+cat > srtslurm.yaml < "$TMP_CONFIG_FILE" + mv "$TMP_CONFIG_FILE" "$CONFIG_FILE" +fi + +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! + +echo "Tailing LOG_FILE: $LOG_FILE" + +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." + +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! 
-d "$LOGS_DIR" ]; then + exit 1 + fi + + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + CONFIG_NAME=$(basename "$result_subdir") + + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi From 1167f6471b9a9118594f9b7e78dea71a2e19e299 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 15:49:07 -0700 
Subject: [PATCH 30/56] upd --- .github/configs/runners.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' From cfae9ae0205411250003ac11e2663e7e4227734e Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 15:54:08 -0700 Subject: [PATCH 31/56] fix --- runners/launch_gb300-cw.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 1b2d27939..a9bb8996f 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -19,7 +19,7 @@ fi # CoreWeave cluster has a single `all` partition; account `cw-sup` is # what `sacctmgr show assoc user=$USER` returns there. `benchmark` -# (inherited from gb200-nv) does not exist on cw. +# (inherited from gb300-nv) does not exist on cw. export SLURM_PARTITION="all" export SLURM_ACCOUNT="cw-sup" @@ -75,7 +75,7 @@ git checkout "$SRT_SLURM_RECIPES_COMMIT" # Overlay the local copy of the exact pinned recipes. This keeps the PR # self-contained while preserving byte-for-byte recipe content from # NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT. -cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb200-fp4" recipes/dsv4-pro/sglang/gb200-fp4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb300-fp4" recipes/dsv4-pro/sglang/gb300-fp4 echo "Installing srtctl..." # CRITICAL — uv install location. 
From 0443a1f2d4890093716a95309b85620df6041a44 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 16:01:06 -0700 Subject: [PATCH 32/56] fix --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 509a401a5..bc50f4670 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7777,7 +7777,7 @@ dsv4-fp4-gb300-dynamo-sglang: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-2p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml" decode: num-worker: 1 tp: 8 From 387726da7108242896364196f09fb7a688fbca49 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 28 Apr 2026 17:07:41 -0700 Subject: [PATCH 33/56] fix --- runners/launch_gb300-cw.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index a9bb8996f..62869cb47 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -72,10 +72,13 @@ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout "$SRT_SLURM_RECIPES_COMMIT" -# Overlay the local copy of the exact pinned recipes. This keeps the PR -# self-contained while preserving byte-for-byte recipe content from -# NVIDIA/srt-slurm at $SRT_SLURM_RECIPES_COMMIT. -cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/gb300-fp4" recipes/dsv4-pro/sglang/gb300-fp4 +# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm +# checkout. Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch: +# destination must be `recipes/sglang/deepseek-v4` because +# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...` +# in `.github/configs/nvidia-master.yaml` is what srtctl loads. 
+mkdir -p recipes/sglang/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 echo "Installing srtctl..." # CRITICAL — uv install location. From fe6815c2a404fac7094166a0797f8fdd6f2a1a47 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 08:14:13 +0800 Subject: [PATCH 34/56] fix(launch_gb300-cw): register deepseek-v4-pro alias in model_paths After fixing the recipe overlay path in 1b07108, srtctl now loads our hand-rolled SGLang recipe and runs preflight, which rejects: Error: Preflight failed for recipes/sglang/.../disagg-gb300-2p1d-dep4-dep8.yaml: - model.path: Model 'deepseek-v4-pro' is not a local model path and is not defined in srtslurm.yaml model_paths. Both `disagg-gb300-2p1d-dep4-dep8.yaml` and `disagg-gb300-7p1d-dep4-dep8.yaml` declare `model.path: deepseek-v4-pro` (per the recipe header comment, the alias is intentionally aligned with `launch_gb200-nv.sh`'s srtslurm.yaml, which exports `SRT_SLURM_MODEL_PREFIX=deepseek-v4-pro`). The gb300-cw launcher only registered `dspro` and `dsv4-pro`, so the alias never resolved. Add `deepseek-v4-pro` mapping to the same `${MODEL_PATH}`. --- runners/launch_gb300-cw.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 62869cb47..a6ec57f3c 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -134,6 +134,12 @@ srtctl_root: "${SRTCTL_ROOT}" model_paths: dspro: "${MODEL_PATH}" dsv4-pro: "${MODEL_PATH}" + # Our hand-rolled DSV4 sglang recipes use `model.path: deepseek-v4-pro` + # (matches the alias in launch_gb200-nv.sh's srtslurm.yaml). Without + # this entry srtctl preflight rejects with "Model 'deepseek-v4-pro' + # is not a local model path and is not defined in srtslurm.yaml + # model_paths". 
+ deepseek-v4-pro: "${MODEL_PATH}" containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} From b4d6c1966e21f255df8889ad001c513ac4048fc4 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 08:27:44 +0800 Subject: [PATCH 35/56] fix(launch_gb300-cw): pull arm64 squash and force fresh import per runner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After fixing model.path alias (fe6815c), the slurm orchestrator reached the head infrastructure srun and crashed at: [ERROR] Invalid image format: /mnt/vast/squash_dupe/lmsysorg_sglang_deepseek-v4-grace-blackwell.sqsh error: pyxis: failed to create container filesystem error: spank: required plugin spank_pyxis.so: task_init() failed with rc=-1 Three issues: 1. The runner pod that runs `enroot import docker://lmsysorg/sglang:...` is x86, so without `--arch` enroot fetches the amd64 manifest. The compute nodes (slurm-gb300-138-*) are aarch64 and pyxis there rejects the amd64 squash with "Invalid image format". Pass `--arch arm64` and tag the cache filename with `_arm64`. 2. `enroot import -o existing.sqsh ...` aborts with `[ERROR] File already exists` and leaves the stale file in place, so once a half-baked or pre-tag-update squash lands at this path it is silently reused on every subsequent CI run. Inspecting /mnt/vast/squash_dupe showed an Apr 26 amd64 sqsh shadowing the Apr 28 working arm64 sqsh exactly like this. `rm -f` before each import forces fresh downloads and picks up Docker tag updates. 3. Scope the squash filename per RUNNER_NAME (gb300-cw_0..3) so that the four matrix runners do not race on rm+import of the same shared path on /mnt/vast. Cost: ~64 GB on /mnt/vast (4 runners × 16 GB per arm64 sqsh) instead of 16 GB shared, which is fine on the shared VAST mount.
--- runners/launch_gb300-cw.sh | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index a6ec57f3c..d9c6dbd17 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -41,11 +41,26 @@ SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" # old /mnt/vast/squash dir contains '+'-separated files from prior runs. SQUASH_DIR="/mnt/vast/squash_dupe" mkdir -p "$SQUASH_DIR" -SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - -enroot import -o $SQUASH_FILE docker://$IMAGE -enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE +# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the +# CI runner pod that performs `enroot import` is x86. Without --arch, +# enroot pulls the host (amd64) manifest and produces a sqsh that pyxis +# on the compute node rejects with "Invalid image format". Force enroot +# to pull the arm64 manifest so the cached sqsh is portable to compute +# nodes. The `_${RUNNER_NAME}_arm64` suffix scopes the cache per runner +# (gb300-cw_0..3) so concurrent matrix jobs don't rm+import the same +# file and corrupt each other's downloads. +SQUASH_TAG="${RUNNER_NAME:-default}_arm64" +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh" + +# Always rebuild the squash from scratch — `enroot import` aborts with +# `[ERROR] File already exists` when targeting an existing path, so +# leaving a stale (interrupted import / pre-update tag) sqsh in place +# silently keeps using the broken file. rm + import guarantees a fresh +# import each CI run and picks up Docker tag updates. 
+rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE" +enroot import --arch arm64 -o "$SQUASH_FILE" "docker://$IMAGE" +enroot import --arch arm64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" From cad94c937995e0ca9b470ff02520ac27f3a45b87 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 08:31:21 +0800 Subject: [PATCH 36/56] fix(launch_gb300-cw): use enroot --arch aarch64, not arm64 enroot 4.0.1's `common::debarch()` accepts kernel-style arch names (`x86_64`, `aarch64`, `ppc64le`) and emits Docker-style names (`amd64`, `arm64`, `ppc64le`) on the wire. Passing `--arch arm64` (the Docker manifest name) trips the function's else branch immediately: [ERROR] Unsupported architecture: arm64 Use the kernel name `aarch64` so enroot can map it to docker's `arm64` manifest internally. --- runners/launch_gb300-cw.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index d9c6dbd17..9b00e21bd 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -59,8 +59,8 @@ NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQ # silently keeps using the broken file. rm + import guarantees a fresh # import each CI run and picks up Docker tag updates. 
rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE" -enroot import --arch arm64 -o "$SQUASH_FILE" "docker://$IMAGE" -enroot import --arch arm64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" +enroot import --arch aarch64 -o "$SQUASH_FILE" "docker://$IMAGE" +enroot import --arch aarch64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" export EVAL_ONLY="${EVAL_ONLY:-false}" From d6fc0e7eb026433a77143bd6c1d9c4b1b3e15794 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 08:35:54 +0800 Subject: [PATCH 37/56] fix(launch_gb300-cw): use pre-staged arm64 sqsh, drop in-CI enroot import Even with `--arch aarch64`, `enroot import` from the CI runner pod (x86) fails when converting the arm64 image: [INFO] Converting whiteouts... /usr/bin/bash: line 1: /usr/bin/enroot-aufs2ovlfs: Operation not permitted (repeated dozens of times, then preflight reports the sqsh as missing) `enroot-aufs2ovlfs` requires CAP_SYS_ADMIN that the runner pod doesn't hold, and `lmsysorg/sglang:deepseek-v4-grace-blackwell` is arm64-only, so the conversion can't be skipped either. Per the documented manual flow at https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780 the import has to be dispatched to an aarch64 GB300 compute node via `srun`. Rather than running an extra slurm job per CI invocation just to prepare the sqsh, point the launcher at the pre-staged arm64 sqsh that already lives at `/mnt/vast/squash_dupe/lmsysorg_sglang_deepseek-v4-grace-blackwell_arm64.sqsh` (refreshed manually via the gist script when the docker tag is bumped). The matching `nginx_1.27.4_arm64.sqsh` was symlinked alongside. Add a fast-fail check so a missing pre-staged sqsh produces a clear error instead of a confusing pyxis "Invalid image format" three steps later. 
--- runners/launch_gb300-cw.sh | 45 ++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 9b00e21bd..5ff3fe5e9 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -42,25 +42,32 @@ SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" SQUASH_DIR="/mnt/vast/squash_dupe" mkdir -p "$SQUASH_DIR" # Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the -# CI runner pod that performs `enroot import` is x86. Without --arch, -# enroot pulls the host (amd64) manifest and produces a sqsh that pyxis -# on the compute node rejects with "Invalid image format". Force enroot -# to pull the arm64 manifest so the cached sqsh is portable to compute -# nodes. The `_${RUNNER_NAME}_arm64` suffix scopes the cache per runner -# (gb300-cw_0..3) so concurrent matrix jobs don't rm+import the same -# file and corrupt each other's downloads. -SQUASH_TAG="${RUNNER_NAME:-default}_arm64" -SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh" -NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_${SQUASH_TAG}.sqsh" - -# Always rebuild the squash from scratch — `enroot import` aborts with -# `[ERROR] File already exists` when targeting an existing path, so -# leaving a stale (interrupted import / pre-update tag) sqsh in place -# silently keeps using the broken file. rm + import guarantees a fresh -# import each CI run and picks up Docker tag updates. -rm -f "$SQUASH_FILE" "$NGINX_SQUASH_FILE" -enroot import --arch aarch64 -o "$SQUASH_FILE" "docker://$IMAGE" -enroot import --arch aarch64 -o "$NGINX_SQUASH_FILE" "docker://$NGINX_IMAGE" +# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as +# arm64-only. 
The CI runner pod is x86_64 and (a) cannot run +# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs` +# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"), +# and (b) even with `--arch aarch64` the conversion still fails on x86. +# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780` +# the import has to be dispatched to an arm64 compute node via srun. +# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh +# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist +# script when the docker tag is updated). Filename suffix `_arm64` +# distinguishes the working arm64 sqsh from any stale amd64 shadow. +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" + +if [[ ! -f "$SQUASH_FILE" ]]; then + echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2 + echo "Refresh it on a GB300 compute node via the script in the gist:" >&2 + echo " https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2 + exit 1 +fi +if [[ ! -f "$NGINX_SQUASH_FILE" ]]; then + echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2 + echo "Run on an aarch64 host:" >&2 + echo " enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2 + exit 1 +fi export EVAL_ONLY="${EVAL_ONLY:-false}" From da6f892b26d2755488c6b673e0c0d9ed6a594e3b Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 08:48:16 +0800 Subject: [PATCH 38/56] fix(launch_gb300-cw): persist dynamo wheel cache and ulimit preamble MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up fixes after CI started successfully reaching slurm but the dynamo-from-source step (`dynamo: hash: 9d3c913d…`) is rebuilt cold on every CI run, taking ~10-20 minutes per matrix job: 1. Cluster-wide dynamo wheel cache. 
srtctl's `_hash_cached_source_install` (`src/srtctl/core/schema.py:912`) is already designed to cache hash-pinned builds at `/configs/dynamo-wheels//{ai_dynamo_runtime-*.whl,dynamo-src.tar.gz,.complete}` under flock. The cache only works if `/configs/dynamo-wheels` survives between CI runs, but the launcher does `rm -rf srt-slurm` and re-clones every time, blowing it away. Mount `/mnt/vast/dynamo-wheels-cache` (NFS, shared by every gb300-cw_N runner) over `/configs/dynamo-wheels` via srtslurm.yaml `default_mounts`, so the cache survives `rm -rf` and is shared across all matrix jobs. After the first cold build the warm path should drop dynamo install to ~30 s. 2. Cluster-wide bash preamble for ulimits. yangminl's manual setup on this cluster (`/mnt/home/yangminl/srt-slurm/srtslurm.yaml`) sets `default_bash_preamble: "ulimit -n 1048576 && ulimit -a"` so the dynamo frontend / sglang servers can accept the 8192-concurrency sweep without `EMFILE: too many open files`. Mirror that here. The feature is supported by srtctl's pinned commit (`src/srtctl/core/slurm.py:_get_cluster_bash_preamble`). --- runners/launch_gb300-cw.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 5ff3fe5e9..529570e8a 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -140,6 +140,20 @@ fi echo "Configs available at: $SRT_REPO_DIR/" SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" + +# Persistent cluster-wide cache for `dynamo: hash:` source builds. The +# upstream cache root (_DYNAMO_CACHE_ROOT in srtctl/core/schema.py) is +# `/configs/dynamo-wheels`; without an override that dir lives inside +# `srt-slurm/configs`, which the launcher wipes via `rm -rf` every CI +# run, so each run does a cold ~10-20 min rust+pyo3 build. 
Stage the +# cache on /mnt/vast (NFS, shared by all gb300-cw_N runners) and have +# srtctl bind-mount it over `/configs/dynamo-wheels` via the cluster +# `default_mounts` setting. flock inside srtctl serializes cold-cache +# builds across concurrent matrix jobs. +DYNAMO_WHEELS_CACHE_HOST="/mnt/vast/dynamo-wheels-cache" +mkdir -p "$DYNAMO_WHEELS_CACHE_HOST" +mkdir -p configs/dynamo-wheels + echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml < Date: Wed, 29 Apr 2026 09:12:26 +0800 Subject: [PATCH 39/56] fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144 for dynamo build slurm assigns 1 CPU/task by default; `scontrol show job ` from a recent CI run shows `NumCPUs=4 NumTasks=4 CPUs/Task=1` with 4 nodes, i.e. one core per worker. The dynamo `hash:` source install rebuilds ~500 rust crates (kube-client, tonic, hf-hub, image codecs ravif/exr, pyo3 stack) and at one core takes 30+ min just for the cold build, which dominates total CI time even with the new `/configs/dynamo-wheels` cache (the cache only helps after the first cold run). Match yangminl's working manual setup (`/mnt/home/yangminl/srt-slurm/recipes/dsv4-pro/sglang/gb300-fp4/all-dynamo.yaml`) which sets `sbatch_directives.cpus-per-task: "144"` so cargo gets the full GB300 host (144 cores) and finishes maturin in a few minutes. 
--- .../deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml | 6 ++++++ .../deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml index bceffd528..a2ad5d45b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml @@ -34,6 +34,12 @@ dynamo: slurm: time_limit: "8:00:00" +# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source +# build (~500 rust crates including ravif/exr/zip) is otherwise serial +# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144. +sbatch_directives: + cpus-per-task: "144" + health_check: max_attempts: 1440 interval_seconds: 10 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml index 731adeb13..8d0fae386 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -34,6 +34,12 @@ dynamo: slurm: time_limit: "8:00:00" +# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source +# build (~500 rust crates including ravif/exr/zip) is otherwise serial +# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144. 
+sbatch_directives: + cpus-per-task: "144" + health_check: max_attempts: 1440 interval_seconds: 10 From 16113f810b7ef51df3509a9bec5d97ae8537de12 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 09:14:22 +0800 Subject: [PATCH 40/56] fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144 and mem=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit slurm assigns 1 CPU/task by default; `scontrol show job 613` from a running CI job confirmed `NumCPUs=4 NumTasks=4 CPUs/Task=1` with 4 nodes — one core per worker. The dynamo `hash:` cold source install rebuilds ~500 rust crates (kube-client, tonic, hf-hub, image codecs ravif/exr, the pyo3 stack) and at one core takes 30+ min just for the cold build, which dominates total CI time even with the new `/configs/dynamo-wheels` cache (the cache only helps after the first cold run). Match yangminl's working manual setup on the same gb300-cw cluster (`/mnt/home/yangminl/srt-slurm/recipes/dsv4-pro/sglang/gb300-fp4/all-dynamo.yaml`) which sets: sbatch_directives: cpus-per-task: "144" mem: "0" cargo then gets the full 144-core GB300 host and finishes maturin in a few minutes; mem=0 hands the worker the entire node's RAM so the dynamo build + DSV4-Pro 671B FP4 weight load fit without OOM. 
--- .../8k1k/disagg-gb300-2p1d-dep4-dep8.yaml | 12 +++++++++--- .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 12 +++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml index a2ad5d45b..bd5a95715 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml @@ -34,11 +34,17 @@ dynamo: slurm: time_limit: "8:00:00" -# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source -# build (~500 rust crates including ravif/exr/zip) is otherwise serial -# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144. +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. 
sbatch_directives: cpus-per-task: "144" + mem: "0" health_check: max_attempts: 1440 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml index 8d0fae386..9d59cbdc3 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -34,11 +34,17 @@ dynamo: slurm: time_limit: "8:00:00" -# Without cpus-per-task slurm gives 1 CPU/task; the dynamo cold source -# build (~500 rust crates including ravif/exr/zip) is otherwise serial -# and takes 30+ min. Match yangminl's all-dynamo.yaml which uses 144. +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. sbatch_directives: cpus-per-task: "144" + mem: "0" health_check: max_attempts: 1440 From ade5488d24f463ed169e4f8456924257879ec731 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 29 Apr 2026 10:25:05 +0800 Subject: [PATCH 41/56] fix(launch_gb300-cw): pin srt-slurm fork with parallel sa-bench The current sa-bench in NVIDIA/srt-slurm@9d75f82 generates random prompts single-threaded, which dominates 7p1d/conc=8192 bench startup (~50 min just for the 81920-prompt main pass before the first HTTP request reaches dynamo). 
Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` (commit 8094cfb), which is 9d75f82 + the SemiAnalysisAI/InferenceX `utils/bench_serving/` benchmark_serving.py ported into sa-bench. With `--random-num-workers 48` (now the default in bench.sh) prompt generation drops to ~1 min on a 144-core GB300 host, putting the bench-startup cost on the same order as infra+model-load instead of dominating it. The fork is paired with the upstream PR https://github.com/NVIDIA/srt-slurm/pull/114; once that merges, this pin should revert to the bumped NVIDIA/srt-slurm SHA. --- runners/launch_gb300-cw.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 529570e8a..053cfaecf 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -33,7 +33,18 @@ export NVIDIA_VISIBLE_DEVICES=all export NVIDIA_DRIVER_CAPABILITIES=compute,utility NGINX_IMAGE="nginx:1.27.4" -SRT_SLURM_RECIPES_COMMIT="9d75f82acec163594658a440f39dd7f1bd35bd16" +# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` +# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt +# generation). The single-threaded random prompt generator in the +# upstream sa-bench dominates bench startup on the 7p1d/conc=8192 +# sweep (~50 min for the main pass alone before the first HTTP +# request leaves the client). The fork bumps that to ~1 min via +# multiprocessing.Pool with `--random-num-workers 48`. +# +# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR +# (https://github.com/NVIDIA/srt-slurm/pull/114) merges. +SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" +SRT_SLURM_RECIPES_COMMIT="8094cfb1db7cad76fbf9ecb41c0c7e662dad301e" # Squash files live alongside models on /mnt/vast (shared across nodes). 
# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / @@ -90,7 +101,7 @@ if [ -d "$SRT_REPO_DIR" ]; then rm -rf "$SRT_REPO_DIR" fi -git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" +git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" git checkout "$SRT_SLURM_RECIPES_COMMIT" From 152a059d5d170799fd0b64c52c3a2f4ab99f1358 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 10:57:26 +0800 Subject: [PATCH 42/56] fix(launch_gb300-cw): bump srt-slurm fork pin to minimal multiproc patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous pin (8094cfb) was a wholesale replacement of sa-bench with the SemiAnalysisAI/InferenceX bench_serving — that dropped `async_request_dynamo_completions` from `ASYNC_REQUEST_FUNCS`, so `bench.sh` would have died on `--backend dynamo` argparse rejection the moment the bench client started. New pin (4249d16) is a tight ~100-line patch on top of NVIDIA/srt-slurm@9d75f82 that only adds parallel random prompt generation (`--random-num-workers`); everything else, including the dynamo backend and `--custom-tokenizer` plumbing, stays exactly the same as upstream. See https://github.com/NVIDIA/srt-slurm/pull/114. --- runners/launch_gb300-cw.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 053cfaecf..ef7260bcb 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -44,7 +44,7 @@ NGINX_IMAGE="nginx:1.27.4" # TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR # (https://github.com/NVIDIA/srt-slurm/pull/114) merges. SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" -SRT_SLURM_RECIPES_COMMIT="8094cfb1db7cad76fbf9ecb41c0c7e662dad301e" +SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5" # Squash files live alongside models on /mnt/vast (shared across nodes). 
# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / From c435a65db23bb3b247734babbdcbd1ba8438ccb8 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 10:58:47 +0800 Subject: [PATCH 43/56] ci: temporarily comment out conc-list:[64] 2p1d entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Focus CI on the conc=8192 7p1d max-throughput entry only — re-enable the 2p1d/conc=64 mid-curve entry shortly once that's green. --- .github/configs/nvidia-master.yaml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bb4e5e1f4..655538929 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7723,19 +7723,21 @@ dsv4-fp4-gb300-dynamo-sglang: # ep: 1 # dp-attn: true # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - - conc-list: [64] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + # TEMPORARILY COMMENTED OUT — focusing CI on the conc=8192 7p1d + # max-throughput entry only. Re-enable shortly once that's green. + # - conc-list: [64] + # prefill: + # num-worker: 2 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213. 
- conc-list: [8192] From be12dbaceef84726e94751d4e70e25d8d45d5b8e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 12:37:03 +0800 Subject: [PATCH 44/56] ci(eval): temporarily skip dsv4-fp4-gb300 dynamo-sglang eval-only entry The srt-slurm pin (9d75f82, recipes/dsv4-agg-disagg) lacks the lm-eval orchestrator path that lives on sa-submission-q2-2026. Skip the auto-generated eval-only matrix entry for this config until the pin is bumped. TODO: remove this branch once the pin is moved to sa-submission-q2-2026 (which already carries the EVAL_ONLY do_sweep.py branch and lm-eval/bench.sh). --- utils/matrix_logic/generate_sweep_configs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e543bb4af..e9a2195ed 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -114,7 +114,11 @@ def _max_eval_conc(ie): ) mn_groups[key].append((i, entry)) - for entries in mn_groups.values(): + for key, entries in mn_groups.items(): + # TODO(pr1157): srt-slurm pin (9d75f82) lacks the lm-eval orchestrator path + # (only on sa-submission-q2-2026). Skip eval-only here until the pin is bumped. 
+ if key[:3] == ("deepseek-ai/DeepSeek-V4-Pro", "gb300-cw", "dynamo-sglang"): + continue best_idx, best_entry = max(entries, key=_max_eval_conc) eval_indices.add(best_idx) # Set eval-conc to median of eligible conc values to avoid OOM during eval From 38acd774c55d3ac245f4da91dac1e92d08daceed Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 12:17:57 +0800 Subject: [PATCH 45/56] bench(7p1d-dep4-dep8): swap sa-bench default for yangminl's gb300-cw recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the sa-bench builder (concurrencies=8192, req_rate=inf, sa-bench default num_prompts/num_warmups multipliers) with the exact custom command from yangminl's gb300-cw 8k1k_hightpt[0] run (slurm job 564 on the dsv4-pro-gb300-fp4 cluster): concurrency=4096, rate=48, num_prompts=40960, num_warmups=512, random_num_workers=96. Why mirror those exact knobs: that recipe is what produced the 7p1d reference numbers we benchmarked against (358K total tok/s, 39.9K output tok/s, ~5s mean TTFT). Running sa-bench at concurrency=8192/rate=inf will saturate the 1-decode-worker GPU (we observed 16384 concurrency on job 617 saturated decode at ~390 running/rank with mean TTFT ~257s, i.e. equilibrium gated by decode compute, not the bench), making the result not directly comparable. Bench framework note: the fzyzcjy fork's benchmark_serving.py / benchmark_utils.py / encoding_dsv4.py are byte-identical to upstream SemiAnalysisAI/InferenceX/main; only backend_request_func.py adds five per-request debug print sites (ok=/lat=/url=/plen=/err=). Throughput numbers should match sa-bench at the same flags; the fork is chosen here to keep parity with the reference run's logs. Skipped on purpose: - DeepGEMM env knobs (SGLANG_DG_CACHE_DIR / SGLANG_JIT_DEEPGEMM_PRECOMPILE vs SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1) — yangminl's cache dir is /configs/deepgemm_cache on the gb300-cw host and isn't portable here; PR's FAST_WARMUP path stays. 
- expert_location_dispatch.py topk_ids int32 cast (yangminl commits 94b7dc4c7 + e933ef2b1 on the patched sglang fork) — not pulling that into the container build. --- .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml index 9d59cbdc3..1d08a229b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -182,9 +182,25 @@ backend: cuda-graph-max-bs: 1152 benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "8192" - req_rate: "inf" - use_chat_template: false + type: "custom" + # Mirror yangminl's gb300-cw 8k1k_hightpt[0] bench (job 564): + # concurrency=4096, rate=48, num-prompts=40960, num-warmups=512, + # random-num-workers=96. Uses upstream SemiAnalysisAI/InferenceX + # benchmark_serving.py at the same flags so this matches the + # reference run's request shape. 
+ command: | + set -e + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/SemiAnalysisAI/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ + --num-warmups 512 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results.json From 22c5e6739040093ad9a8e0a19dd31415600950ff Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 12:25:05 +0800 Subject: [PATCH 46/56] =?UTF-8?q?config(7p1d-dep4-dep8):=20align=20with=20?= =?UTF-8?q?job=20564=20=E2=80=94=20multi-frontend,=20sbatch=20dirs,=20name?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate every non-cluster-specific diff vs job 564's resolved config (`/outputs/564/config_8k1k_hightpt_0.yaml`): - name: match `dsv4-pro-gb300-fp4_8k1k_hightpt_0` (was stale gb200 string) - frontend.enable_multiple_frontends: false → true; add num_additional_frontends: 8 (job 564 ran 9 dynamo frontends behind nginx; PR was running a single frontend, which was a real router-side runtime diff) - slurm.time_limit: 8h → 3h to match job 564 - sbatch_directives.cpus-per-task: 144, mem: 0 (portable, was missing) - drop health_check block (job 564 doesn't set it; rely on srtctl default) Remaining diffs vs job 564 are all either cluster-specific path bindings (slurm.partition=hpc-mid, frontend.nginx_container, extra_mount of yangminl's patched sglang) or DG-cache env (SGLANG_DG_CACHE_DIR / SGLANG_JIT_DEEPGEMM_PRECOMPILE) — those need InferenceX-cluster-side equivalents and are documented in the header 
comment. --- .../8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml index 1d08a229b..afa4de33f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -1,4 +1,4 @@ -name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" +name: "dsv4-pro-gb300-fp4_8k1k_hightpt_0" # 8k/1k high-throughput topology for the wideep DSV4-Pro setup. # @@ -11,13 +11,25 @@ name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" # wideep [0] override and lifts `benchmark` back out — same operational # values, schema the pinned srtctl will accept. # -# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 -# (matrix runs on gb200-nv runners, not gb300), container & model.path -# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml -# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), -# slurm.partition + sbatch_directives + extra_mount + nginx_container -# dropped (they reference paths/partitions that exist only on the PR -# author's gb300 cluster). +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. 
+# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. model: path: "deepseek-v4-pro" @@ -32,7 +44,7 @@ dynamo: install: true slurm: - time_limit: "8:00:00" + time_limit: "03:00:00" # Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: # cpus-per-task=144 — without this slurm hands out 1 CPU/task, which @@ -46,10 +58,6 @@ sbatch_directives: cpus-per-task: "144" mem: "0" -health_check: - max_attempts: 1440 - interval_seconds: 10 - # Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode # (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. 
resources: @@ -64,7 +72,8 @@ resources: frontend: type: dynamo - enable_multiple_frontends: false + enable_multiple_frontends: true + num_additional_frontends: 8 backend: type: sglang From 15423f1e56c86bc6d42be584da98ecb05de5543b Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Wed, 29 Apr 2026 12:25:46 +0800 Subject: [PATCH 47/56] config(7p1d-dep4-dep8): keep PR name field, revert to original --- .../sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml index afa4de33f..fc2a1ef7a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml @@ -1,4 +1,4 @@ -name: "dsv4-pro-gb300-fp4_8k1k_hightpt_0" +name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" # 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
# From a1a6f8d0ff4dce526a0cf0c8d0a0ee28f0d92e35 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 12:45:06 -0700 Subject: [PATCH 48/56] upd --- .github/configs/nvidia-master.yaml | 71 ++++---- .../sglang/deepseek-v4/8k1k/conc1.yaml | 167 ++++++++++++++++++ ...300-2p1d-dep4-dep8.yaml => conc16384.yaml} | 75 ++++---- ...b300-7p1d-dep4-dep8.yaml => conc2048.yaml} | 59 +++---- 4 files changed, 262 insertions(+), 110 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb300-2p1d-dep4-dep8.yaml => conc16384.yaml} (75%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-gb300-7p1d-dep4-dep8.yaml => conc2048.yaml} (82%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7ad8d5864..9a4e8f39b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7736,52 +7736,45 @@ dsv4-fp4-gb300-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # Low/mid-concurrency entries (1p1d-dep8-tep8 and 3p1d-dep8-dep16 - # recipes) commented out: PR #1213 only refreshed the 7p1d-dep8-dep16 - # max-throughput recipe; the 1p1d/3p1d siblings still match the older - # operational shape and are out of scope for the PR #1213 sweep. - # # Low-concurrency: 1 prefill (TP=8) + 1 decode (TP=8). 4 nodes. - # - conc-list: [1, 4, 8, 16, 32, 64] - # prefill: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 1 - # dp-attn: true - # Mid: 3 prefills (TP=8) + 1 decode (TP=8). 8 nodes. - # TEMPORARILY COMMENTED OUT — focusing CI on the conc=8192 7p1d - # max-throughput entry only. Re-enable shortly once that's green. 
- # - conc-list: [64] - # prefill: - # num-worker: 2 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # Max throughput: 7 prefills (TP=4 / DP=4 / EP=4) + 1 decode - # (TP=8 / DP=8 / EP=8 wideep). 9 nodes. Refreshed by PR #1213. - - conc-list: [8192] + # Low concurrency + - conc-list: [1] prefill: - num-worker: 7 + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Mid concurrency + - conc-list: [2048] + prefill: + num-worker: 4 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true + # Max concurrency + - conc-list: [16384] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml new file mode 100644 index 000000000..1f1649d29 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml @@ -0,0 +1,167 @@ +name: "conc1" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. 
Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + 
MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml similarity index 75% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml index bd5a95715..4d696ae35 100644 --- 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml @@ -1,4 +1,4 @@ -name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" +name: "conc16384" # 8k/1k high-throughput topology for the wideep DSV4-Pro setup. # @@ -11,13 +11,25 @@ name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" # wideep [0] override and lifts `benchmark` back out — same operational # values, schema the pinned srtctl will accept. # -# Other adjustments back to the InferenceX cluster shape: gpu_type=gb200 -# (matrix runs on gb200-nv runners, not gb300), container & model.path -# restored to the aliases mapped in launch_gb200-nv.sh's srtslurm.yaml -# (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and `deepseek-v4-pro`), -# slurm.partition + sbatch_directives + extra_mount + nginx_container -# dropped (they reference paths/partitions that exist only on the PR -# author's gb300 cluster). +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. 
+# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. model: path: "deepseek-v4-pro" @@ -32,7 +44,7 @@ dynamo: install: true slurm: - time_limit: "8:00:00" + time_limit: "03:00:00" # Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: # cpus-per-task=144 — without this slurm hands out 1 CPU/task, which @@ -46,25 +58,22 @@ sbatch_directives: cpus-per-task: "144" mem: "0" -health_check: - max_attempts: 1440 - interval_seconds: 10 - -# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode -# (TP=8 / DP=8 / EP=8 / 2 nodes). 3 nodes total. +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 2 - decode_nodes: 2 - prefill_workers: 2 - decode_workers: 1 + prefill_nodes: 14 + prefill_workers: 14 gpus_per_prefill: 4 - gpus_per_decode: 8 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 frontend: type: dynamo - enable_multiple_frontends: false + enable_multiple_frontends: true + num_additional_frontends: 8 backend: type: sglang @@ -136,7 +145,7 @@ backend: trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 + stream-interval: 60 tensor-parallel-size: 4 data-parallel-size: 4 @@ -159,15 +168,7 @@ backend: trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 - - # Wideep decode shape (zip_override [0] from PR #1213, inlined). 
- tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - - enable-dp-attention: true - enable-dp-lm-head: true + stream-interval: 60 moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' @@ -178,13 +179,21 @@ backend: mem-fraction-static: 0.94 swa-full-tokens-ratio: 0.15 context-length: 16384 - max-running-requests: 9216 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 18432 cuda-graph-max-bs: 1152 + benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "64" + concurrencies: "16384" req_rate: "inf" use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml similarity index 82% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml index fc2a1ef7a..72b8babf5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-gb300-7p1d-dep4-dep8.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml @@ -1,4 +1,4 @@ -name: "dsv4-sglang-disagg-gb200-7p1d-dep8-dep16" +name: "conc2048" # 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
# @@ -63,11 +63,11 @@ sbatch_directives: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 7 + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 decode_nodes: 2 - prefill_workers: 7 decode_workers: 1 - gpus_per_prefill: 4 gpus_per_decode: 8 frontend: @@ -145,7 +145,7 @@ backend: trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 + stream-interval: 60 tensor-parallel-size: 4 data-parallel-size: 4 @@ -168,15 +168,7 @@ backend: trust-remote-code: true watchdog-timeout: 86400 skip-tokenizer-init: true - stream-interval: 30 - - # Wideep decode shape (zip_override [0] from PR #1213, inlined). - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 - - enable-dp-attention: true - enable-dp-lm-head: true + stream-interval: 60 moe-a2a-backend: "deepep" deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' @@ -187,29 +179,20 @@ backend: mem-fraction-static: 0.94 swa-full-tokens-ratio: 0.15 context-length: 16384 - max-running-requests: 9216 - cuda-graph-max-bs: 1152 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 benchmark: - type: "custom" - # Mirror yangminl's gb300-cw 8k1k_hightpt[0] bench (job 564): - # concurrency=4096, rate=48, num-prompts=40960, num-warmups=512, - # random-num-workers=96. Uses upstream SemiAnalysisAI/InferenceX - # benchmark_serving.py at the same flags so this matches the - # reference run's request shape. 
- command: | - set -e - REPO=/configs/upstream-sa-bench/InferenceX - [ -d "$REPO" ] || git clone https://github.com/SemiAnalysisAI/InferenceX.git "$REPO" - cd "$REPO/utils/bench_serving" - python3 benchmark_serving.py \ - --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ - --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ - --dataset-name random \ - --random-input-len 8192 --random-output-len 1024 --random-range-ratio 0.8 \ - --random-num-workers 96 \ - --num-prompts 40960 --max-concurrency 4096 --request-rate 48 \ - --num-warmups 512 \ - --ignore-eos --trust-remote-code \ - --percentile-metrics ttft,tpot,itl,e2el \ - --save-result --result-dir /logs --result-filename results.json + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" + use_chat_template: false From b146b86cbc8daae4a68dfada183fb617edae34d0 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 12:59:35 -0700 Subject: [PATCH 49/56] fix --- perf-changelog.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1dd575b18..4c447eaf6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1971,6 +1971,13 @@ - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 +- config-keys: + - minimaxm2.5-fp4-mi355x-atom + description: + - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" + - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 + - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: @@ -1994,3 +2001,10 @@ - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add conc=8192 recipe for 1k1k: deepep mega_moe backend with cuda-graph-max-bs 1088, max-running-requests 8192, mem-fraction-static 0.80, swa-full-tokens-ratio 0.3, tokenizer-worker-num 16" + - "conc=8192 enables SGLANG_OPT_USE_ONLINE_COMPRESS=1 and --stream-interval 30" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1209 From c843c0df725f9c5e7c1682c69f265b462afd6673 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 13:07:16 -0700 Subject: [PATCH 50/56] fix --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a4515e784..e0546789e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1961,7 +1961,7 @@ - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1204 - + - config-keys: - minimaxm2.5-fp4-mi355x-atom description: @@ -1969,7 +1969,7 @@ - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 -- config-keys: +- config-keys: - dsv4-fp4-gb200-dynamo-vllm description: - "DSV4-Pro FP4 GB200 dynamo-vLLM disagg against srt-slurm aflowers/vllm-gb200-v0.20.0" From 927edfebeb94c9487685cd60a6278a8fa8b630fb Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 15:40:19 -0700 Subject: [PATCH 51/56] middle --- .github/configs/nvidia-master.yaml | 84 ++++++-- .../sglang/deepseek-v4/8k1k/conc1024.yaml | 198 ++++++++++++++++++ .../sglang/deepseek-v4/8k1k/conc256-dp.yaml | 198 ++++++++++++++++++ .../sglang/deepseek-v4/8k1k/conc256.yaml | 167 +++++++++++++++ .../sglang/deepseek-v4/8k1k/conc512.yaml | 198 ++++++++++++++++++ 5 files changed, 
831 insertions(+), 14 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0f0653ad3..af88972ea 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7737,45 +7737,101 @@ dsv4-fp4-gb300-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # Low concurrency - - conc-list: [1] + # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes. + - conc-list: [256] prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256.yaml" decode: num-worker: 1 - tp: 4 + tp: 8 ep: 1 dp-attn: false - # Mid concurrency - - conc-list: [2048] + # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. + - conc-list: [256] prefill: - num-worker: 4 + num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true - # Max concurrency - - conc-list: [16384] + # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. + - conc-list: [512] prefill: - num-worker: 14 + num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true + # DP-attn wideep: 2p1d-dep4-dep8. 4 nodes. 
+ - conc-list: [1024] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1024.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # # Low concurrency + # - conc-list: [1] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 1 + # dp-attn: false + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" + # decode: + # num-worker: 1 + # tp: 4 + # ep: 1 + # dp-attn: false + # # Mid concurrency + # - conc-list: [2048] + # prefill: + # num-worker: 4 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # # Max concurrency + # - conc-list: [16384] + # prefill: + # num-worker: 14 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml new file mode 100644 index 000000000..d1f6aa2bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml @@ -0,0 +1,198 @@ +name: "conc1024" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. 
So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. 
+# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 4 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" +
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 8 + 
data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml new file mode 100644 index 000000000..eac786947 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml @@ -0,0 +1,198 @@ +name: "conc256-dp" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. 
Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. 
+resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + 
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "256" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml new file mode 100644 index 000000000..ff628d272 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml @@ -0,0 +1,167 @@ +name: "conc256" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. 
+# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. 
+resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 8 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml new file mode 100644 index 000000000..71cfa4bc3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml @@ -0,0 +1,198 @@ +name: "conc512" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. 
So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. +dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. 
+# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 1 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 3 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" +
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 8 + 
data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + use_chat_template: false From c14d06dd75a81a1f02439b41f1885165916abb06 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 15:42:12 -0700 Subject: [PATCH 52/56] fi --- .../srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml index ff628d272..a4460a3c5 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml @@ -68,7 +68,7 @@ resources: gpus_per_prefill: 4 decode_nodes: 2 decode_workers: 1 - gpus_per_decode: 4 + gpus_per_decode: 8 frontend: type: dynamo @@ -162,6 +162,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "16" + concurrencies: "256" req_rate: "inf" use_chat_template: false From 5e86ffcf98dd894729acbfa64e2834445e823c4e Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 15:49:27 -0700 Subject: [PATCH 53/56] fix --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c0c5b5e4c..f27d33ea6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1961,14 +1961,14 @@ - "Gate --moe-backend deep_gemm_mega_moe and --gpu-memory-utilization 0.85 on DP_ATTENTION=true per the v0.20.0 recipe" - "Drop --pipeline-parallel-size 1; keep --no-enable-prefix-caching and --max-cudagraph-capture-size 2048" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/1204 - + - config-keys: - minimaxm2.5-fp4-mi355x-atom description: - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)" - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042 - + - config-keys: - dsv4-fp4-gb200-dynamo-vllm description: From 5776fd5a969bed8cc1f08d238437c549859bf0bc Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 17:16:58 -0700 Subject: [PATCH 54/56] upd --- .github/configs/nvidia-master.yaml | 20 +-- .../sglang/deepseek-v4/8k1k/conc256.yaml | 167 ------------------ .../8k1k/{conc256-dp.yaml => conc512-20.yaml} | 14 +- 3 files changed, 10 insertions(+), 191 deletions(-) delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{conc256-dp.yaml => conc512-20.yaml} (97%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b7f0c607b..155f1b7c0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7738,34 +7738,20 @@ dsv4-fp4-gb300-dynamo-sglang: osl: 1024 search-space: # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes. - - conc-list: [256] + - conc-list: [512] prefill: num-worker: 1 tp: 4 ep: 1 dp-attn: false additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml" decode: num-worker: 1 - tp: 8 + tp: 16 ep: 1 dp-attn: false # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. - - conc-list: [256] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. 
- conc-list: [512] prefill: num-worker: 1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml deleted file mode 100644 index a4460a3c5..000000000 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256.yaml +++ /dev/null @@ -1,167 +0,0 @@ -name: "conc256" - -# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. -# -# Schema/values come from PR #1213 (513cbef) — that PR introduced the -# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` -# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our -# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't -# support either: `zip_override_*_hightpt` rejects with `Unknown field` -# and `benchmark` only validates at top level. So this file inlines the -# wideep [0] override and lifts `benchmark` back out — same operational -# values, schema the pinned srtctl will accept. -# -# Other adjustments back to the InferenceX cluster shape: container & -# model.path restored to the aliases mapped in launch_gb300.sh's -# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and -# `deepseek-v4-pro`); `dynamo.install: true` added so the container -# (which has no dynamo baked in) installs from the pinned hash. -# -# Cluster-specific items NOT inlined (require InferenceX-side equivalents): -# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) -# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) -# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis -# showed only `expert_location_dispatch.py` topk_ids int32 cast is an -# active runtime diff vs container sglang; other patched files are -# env-gated dead code under the same SGLANG_OPT_* flags this yaml -# already sets. 
-# -# DG-related env intentionally diverged (DG cache path is host-specific): -# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) -# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) -# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. - -model: - path: "deepseek-v4-pro" - container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" - precision: "fp4" - -# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin -# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 -# dev branch. -dynamo: - hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" - install: true - -slurm: - time_limit: "03:00:00" - -# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: -# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which -# turns the dynamo `hash:` cold source build (~500 rust crates, -# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 -# cargo finishes in ~5 min. -# mem=0 — slurm's "give the whole node's memory"; needed -# for sglang loading 671B FP4 weights + dynamo build at the same -# time without OOM. -sbatch_directives: - cpus-per-task: "144" - mem: "0" - -# Topology: 7 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode -# (TP=8 / DP=8 / EP=8 / 2 nodes). 9 nodes total. 
-resources: - gpu_type: "gb300" - gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 - gpus_per_prefill: 4 - decode_nodes: 2 - decode_workers: 1 - gpus_per_decode: 8 - -frontend: - type: dynamo - enable_multiple_frontends: true - num_additional_frontends: 8 - -backend: - type: sglang - - prefill_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - - decode_environment: - PYTHONUNBUFFERED: "1" - SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" - SGLANG_ENABLE_THINKING: "1" - SGLANG_REASONING_EFFORT: "max" - SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - NCCL_MNNVL_ENABLE: "1" - NCCL_CUMEM_ENABLE: "1" - SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" - MC_FORCE_MNNVL: "1" - SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" - SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" - SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" - # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 - # is single-node only and corrupts results in 2-node decode setups. 
- - sglang_config: - prefill: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "prefill" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 4 - data-parallel-size: 1 - expert-parallel-size: 1 - - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - - mem-fraction-static: 0.90 - max-running-requests: 512 - cuda-graph-max-bs: 512 - chunked-prefill-size: 32768 - - decode: - served-model-name: "deepseek-ai/DeepSeek-V4-Pro" - model-path: "/model/" - trust-remote-code: true - disable-radix-cache: true - - disaggregation-mode: "decode" - disaggregation-transfer-backend: mooncake - - tensor-parallel-size: 8 - data-parallel-size: 1 - expert-parallel-size: 1 - - moe-runner-backend: "flashinfer_mxfp4" - disable-flashinfer-autotune: true - - mem-fraction-static: 0.9 - max-running-requests: 1024 - cuda-graph-max-bs: 512 - swa-full-tokens-ratio: 0.1 - context-length: 16384 - -benchmark: - type: "sa-bench" - isl: 8192 - osl: 1024 - concurrencies: "256" - req_rate: "inf" - use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml similarity index 97% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml index eac786947..526aa8636 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc256-dp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml @@ -1,4 +1,4 @@ -name: "conc256-dp" +name: "conc512" # 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
# @@ -66,9 +66,9 @@ resources: prefill_nodes: 1 prefill_workers: 1 gpus_per_prefill: 4 - decode_nodes: 2 + decode_nodes: 4 decode_workers: 1 - gpus_per_decode: 8 + gpus_per_decode: 16 frontend: type: dynamo @@ -179,9 +179,9 @@ backend: mem-fraction-static: 0.94 swa-full-tokens-ratio: 0.15 context-length: 16384 - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 enable-dp-attention: true enable-dp-lm-head: true moe-a2a-backend: deepep @@ -193,6 +193,6 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "256" + concurrencies: "512" req_rate: "inf" use_chat_template: false From fce13d0546350755ec87743b6b34450ff32bb766 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 17:19:46 -0700 Subject: [PATCH 55/56] fix --- .github/configs/nvidia-master.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f5da3cdbf..c8d6834ca 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7762,20 +7762,20 @@ dsv4-fp4-gb300-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # Low-latency wideTP decode (no DP-attn): 1p1d, TP=4 prefill / TP=8 decode. 3 nodes. + # WideEP TP=16 decode: 1p1d-dep4-dep16. 5 nodes (4P + 16D = 20 GPUs). - conc-list: [512] prefill: num-worker: 1 tp: 4 - ep: 1 - dp-attn: false + ep: 4 + dp-attn: true additional-settings: - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml" decode: num-worker: 1 tp: 16 - ep: 1 - dp-attn: false + ep: 16 + dp-attn: true # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. 
- conc-list: [512] prefill: From 484763a7fdf5b7472ce57802a7952d7b81cf5ece Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 29 Apr 2026 18:55:33 -0700 Subject: [PATCH 56/56] upd --- .github/configs/nvidia-master.yaml | 84 +++++++++++++++--------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c8d6834ca..aff249a8b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7804,45 +7804,45 @@ dsv4-fp4-gb300-dynamo-sglang: tp: 8 ep: 8 dp-attn: true - # # Low concurrency - # - conc-list: [1] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" - # decode: - # num-worker: 1 - # tp: 4 - # ep: 1 - # dp-attn: false - # # Mid concurrency - # - conc-list: [2048] - # prefill: - # num-worker: 4 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # # Max concurrency - # - conc-list: [16384] - # prefill: - # num-worker: 14 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true + # Low concurrency + - conc-list: [1] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Mid concurrency + - conc-list: [2048] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Max concurrency + - conc-list: [16384] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true 
+ additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true