diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e0a42f706..aff249a8b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7748,3 +7748,101 @@ dsv4-fp4-gb200-dynamo-vllm: tp: 8 ep: 8 dp-attn: true + +dsv4-fp4-gb300-dynamo-sglang: + image: lmsysorg/sglang:deepseek-v4-grace-blackwell + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 8192 + osl: 1024 + search-space: + # WideEP TP=16 decode: 1p1d-dep4-dep16. 5 nodes (4P + 16D = 20 GPUs). + - conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # DP-attn wideep: 1p1d-dep4-dep8. 3 nodes. + - conc-list: [512] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc512.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # DP-attn wideep: 2p1d-dep4-dep8. 4 nodes. 
+ - conc-list: [1024] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1024.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Low concurrency + - conc-list: [1] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc1.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + # Mid concurrency + - conc-list: [2048] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc2048.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # Max concurrency + - conc-list: [16384] + prefill: + num-worker: 14 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/conc16384.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 60f3299cf..f574c629c 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -139,3 +139,8 @@ gb300: - 'gb300-nv_0' - 'gb300-nv_1' - 'gb300-nv_2' +gb300-cw: +- 'gb300-cw_0' +- 'gb300-cw_1' +- 'gb300-cw_2' +- 'gb300-cw_3' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml new file mode 100644 index 000000000..1f1649d29 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1.yaml @@ -0,0 +1,167 @@ +name: "conc1" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. +# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. 
Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 1 prefill (TP=4 / 1 node) + 1 decode +# (TP=4 / 1 node). 2 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 1 + decode_workers: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + 
MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 16384 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml new file mode 100644 index 000000000..d1f6aa2bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc1024.yaml @@ -0,0 +1,198 @@ +name: "conc1024" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
+# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 2 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 4 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + 
DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml new file mode 100644 index 000000000..4d696ae35 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc16384.yaml @@ -0,0 +1,199 @@ +name: "conc16384" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
+# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 14 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=16 / DP=16 / EP=16 / 4 nodes). 18 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 14 + prefill_workers: 14 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + 
DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 18432 + cuda-graph-max-bs: 1152 + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16384" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml new file mode 100644 index 000000000..72b8babf5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc2048.yaml @@ -0,0 +1,198 @@ +name: "conc2048" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
+# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo: + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" + install: true + +slurm: + time_limit: "03:00:00" + +# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster: +# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which +# turns the dynamo `hash:` cold source build (~500 rust crates, +# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144 +# cargo finishes in ~5 min. +# mem=0 — slurm's "give the whole node's memory"; needed +# for sglang loading 671B FP4 weights + dynamo build at the same +# time without OOM. +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +# Topology: 4 prefill (TP=4 / DP=4 / EP=4 / 1 node each) + 1 decode +# (TP=8 / DP=8 / EP=8 / 2 nodes). 6 nodes total. +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 4 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + 
DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + DYN_SKIP_SGLANG_LOG_FORMATTING: "1" + SGLANG_LOG_FORWARD_ITERS: "1" + SGLANG_LOG_MS: "1" + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. 
+ + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + max-running-requests: 3072 + cuda-graph-max-bs: 512 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml new file mode 100644 index 000000000..526aa8636 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512-20.yaml @@ -0,0 +1,198 @@ +name: "conc512" + +# 8k/1k high-throughput topology for the wideep DSV4-Pro setup. 
+# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which
+# turns the dynamo `hash:` cold source build (~500 rust crates,
+# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+# cargo finishes in ~5 min.
+# mem=0 — slurm's "give the whole node's memory"; needed
+# for sglang loading 671B FP4 weights + dynamo build at the same
+# time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 1 prefill worker (TP=4 / DP=4 / EP=4, 1 node) + 1 decode
+# worker (TP=16 / DP=16 / EP=16, 4 nodes). 5 nodes total — the
+# 1p1d-dep4-dep16 matrix entry. (An earlier copy of this comment
+# described a 7-prefill / 9-node layout that never matched the
+# resources below.)
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 4
+  decode_workers: 1
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in multi-node decode
+    # setups (this file's decode worker spans 4 nodes).
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      # Set once here; a duplicate moe-a2a-backend/deepep-config pair
+      # further down was removed (duplicate keys are invalid per the
+      # YAML 1.2 spec and silently last-win in most parsers).
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      tensor-parallel-size: 16
+      data-parallel-size: 16
+      expert-parallel-size: 16
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml
new file mode 100644
index 000000000..71cfa4bc3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/conc512.yaml
@@ -0,0 +1,198 @@
+name: "conc512"
+
+# 8k/1k high-throughput topology for the wideep DSV4-Pro setup.
+# +# Schema/values come from PR #1213 (513cbef) — that PR introduced the +# `dsv4-pro-gb300-fp4` upstream-style recipe with two `zip_override` +# variants (wideep [0] / narrow_ep [1]) and `backend.benchmark`. Our +# pinned srtctl (NVIDIA/srt-slurm @ sa-submission-q2-2026) doesn't +# support either: `zip_override_*_hightpt` rejects with `Unknown field` +# and `benchmark` only validates at top level. So this file inlines the +# wideep [0] override and lifts `benchmark` back out — same operational +# values, schema the pinned srtctl will accept. +# +# Other adjustments back to the InferenceX cluster shape: container & +# model.path restored to the aliases mapped in launch_gb300.sh's +# srtslurm.yaml (`lmsysorg/sglang:deepseek-v4-grace-blackwell` and +# `deepseek-v4-pro`); `dynamo.install: true` added so the container +# (which has no dynamo baked in) installs from the pinned hash. +# +# Cluster-specific items NOT inlined (require InferenceX-side equivalents): +# - slurm.partition (yangminl's gb300-cw uses `hpc-mid`) +# - frontend.nginx_container (yangminl's `nginx-1.27.4.sqsh` path) +# - extra_mount: yangminl/sglang-patched/sglang. Earlier diff analysis +# showed only `expert_location_dispatch.py` topk_ids int32 cast is an +# active runtime diff vs container sglang; other patched files are +# env-gated dead code under the same SGLANG_OPT_* flags this yaml +# already sets. +# +# DG-related env intentionally diverged (DG cache path is host-specific): +# - SGLANG_DG_CACHE_DIR=/configs/deepgemm_cache (yangminl host) +# - SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 (yangminl uses prebuilt cache) +# This yaml uses SGLANG_JIT_DEEPGEMM_FAST_WARMUP=1 instead. + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:deepseek-v4-grace-blackwell" + precision: "fp4" + +# See ../1k1k/disagg-gb200-1p1d-dep8-tep8.yaml for the dynamo pin +# rationale. Hash bumped from PR #1213 to track the dynamo-sglang dsv4 +# dev branch. 
+dynamo:
+  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"
+  install: true
+
+slurm:
+  time_limit: "03:00:00"
+
+# Match yangminl's working all-dynamo.yaml on the same gb300-cw cluster:
+# cpus-per-task=144 — without this slurm hands out 1 CPU/task, which
+# turns the dynamo `hash:` cold source build (~500 rust crates,
+# ravif/exr/zip/pyo3 stack) into a 30+ min serial compile. With 144
+# cargo finishes in ~5 min.
+# mem=0 — slurm's "give the whole node's memory"; needed
+# for sglang loading 671B FP4 weights + dynamo build at the same
+# time without OOM.
+sbatch_directives:
+  cpus-per-task: "144"
+  mem: "0"
+
+# Topology: 1 prefill worker (TP=4 / DP=4 / EP=4, 1 node) + 1 decode
+# worker (TP=8 / DP=8 / EP=8, 2 nodes). 3 nodes total — the
+# 1p1d-dep4-dep8 matrix entry. (An earlier copy of this comment
+# described a 7-prefill / 9-node layout that never matched the
+# resources below.)
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  prefill_workers: 1
+  gpus_per_prefill: 4
+  decode_nodes: 2
+  decode_workers: 1
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 8
+
+backend:
+  type: sglang
+
+  prefill_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+
+  decode_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
+    SGLANG_ENABLE_THINKING: "1"
+    SGLANG_REASONING_EFFORT: "max"
+    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
+    SGLANG_OPT_USE_JIT_NORM: "1"
+    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+    SGLANG_OPT_USE_TOPK_V2: "1"
+    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+    SGLANG_OPT_USE_FAST_MASK_EP: "1"
+    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "1152"
+    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_LOG_FORWARD_ITERS: "1"
+    SGLANG_LOG_MS: "1"
+    SGLANG_REQUEST_STATE_WAIT_TIMEOUT: "60"
+    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+    # is single-node only and corrupts results in 2-node decode setups.
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      expert-parallel-size: 4
+
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.90
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+      chunked-prefill-size: 32768
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      trust-remote-code: true
+      watchdog-timeout: 86400
+      skip-tokenizer-init: true
+      stream-interval: 60
+
+      # Set once here; a duplicate moe-a2a-backend/deepep-config pair
+      # further down was removed (duplicate keys are invalid per the
+      # YAML 1.2 spec and silently last-win in most parsers).
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: mooncake
+
+      mem-fraction-static: 0.94
+      swa-full-tokens-ratio: 0.15
+      context-length: 16384
+      tensor-parallel-size: 8
+      data-parallel-size: 8
+      expert-parallel-size: 8
+      enable-dp-attention: true
+      enable-dp-lm-head: true
+      max-running-requests: 3072
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
+  use_chat_template: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ed5eefa6d..ae2b88f03 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1968,7 +1968,7 @@
   - "Add MiniMax-M2.5 MXFP4 MI355X Atom benchmark (rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post)"
   - "Single-node sweep: TP1–TP8, 1k/1k and 8k/1k ISL/OSL"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1042
- 
+
 - config-keys:
   - dsv4-fp4-gb200-dynamo-vllm
   description:
@@ -2022,3 +2022,12 @@
   - "Remove stale offload recipe copies and
the old no-MegaMOE mid/max-throughput points from the GB200 Dynamo vLLM matrix"
   - "Disable FlashInfer autotune on GB200 decode workers for accuracy stability, matching the srt-slurm recipe fix"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1223
+
+- config-keys:
+  - dsv4-fp4-gb300-dynamo-sglang
+  description:
+  - "Add DeepSeek-V4-Pro FP4 GB300 disaggregated SGLang benchmarks via Dynamo (8k/1k sweep on the gb300-cw CoreWeave runner)"
+  - "Container: lmsysorg/sglang:deepseek-v4-grace-blackwell (linux/arm64); model from /mnt/vast/models/dsv4/ (shared VAST mount — gb300-cw has no compute-node-local NVMe)"
+  - "Topologies: low-conc 1p1d (prefill TP4, decode TP4/EP1) at conc 1; conc-512 topology-crossover A/B between 1p1d-dep4-dep16 (5 nodes) and 1p1d-dep4-dep8 (3 nodes); 2p1d-dep4-dep8 (4 nodes) at conc 1024; 4p1d-dep4-dep8 at 2048; 14p1d-dep4-dep16 at 16384"
+  - "No upstream GB300 DSV4 sglang disagg recipe exists. Per-worker sglang_config (deepep a2a + dp-attention, mooncake transfer backend, chunked-prefill-size 32768, mem-fraction-static 0.90 prefill / 0.94 decode) inlines the wideep override from NVIDIA/srt-slurm PR #1213 (dsv4-pro-gb300-fp4), cross-checked against the SGLang DeepSeek-V4 cookbook.
Stored under benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/ and overlaid onto the upstream srt-slurm checkout at runtime" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1157 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 333e94359..01be0fd29 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -15,6 +15,12 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SRT_SLURM_MODEL_PREFIX="dsr1-fp4" + elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Same compute-node-local NVMe path as the dynamo-vllm dsv4 + # branch — see that branch for rationale. SRT_SLURM_MODEL_PREFIX + # matches the model.path alias in our DSV4 sglang recipes. + export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" + export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" else export MODEL_PATH=$MODEL fi @@ -150,6 +156,16 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then # `recipes/vllm/deepseek-v4/deepseek-v4/...` in that case). mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then + # Mirrors the dynamo-vllm dsv4 branch above: pin to the q2-2026 + # NVIDIA srt-slurm (newer srtctl + dynamo-sglang container alias) + # and overlay our hand-rolled DSV4 sglang recipes. NVIDIA/srt-slurm + # has no upstream sglang DSV4 disagg recipes yet, hence the overlay. 
+ git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/deepseek-v4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh new file mode 100644 index 000000000..ef7260bcb --- /dev/null +++ b/runners/launch_gb300-cw.sh @@ -0,0 +1,343 @@ +#!/usr/bin/bash + +# Launches multi-node Dynamo + SGLang benchmarks on the gb300-cw +# (CoreWeave) cluster. Adapted from the dynamo-vllm sibling launcher in +# the dsv4-fp4-gb300-dynamo-vllm-disagg branch (PR #1150). The SGLang +# recipes are copied exactly from the pinned srt-slurm commit below. + +set -x + +if [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then + # Weights staged on the shared VAST mount; no compute-node-local + # NVMe on cw. The exact upstream recipes refer to this model as + # `dspro`. + export MODEL_PATH="/mnt/vast/models/dsv4/" +else + echo "Unsupported model prefix/precision/framework combination on gb300-cw: $MODEL_PREFIX/$PRECISION/$FRAMEWORK. Currently supported: dsv4/fp4/dynamo-sglang" + exit 1 +fi + +# CoreWeave cluster has a single `all` partition; account `cw-sup` is +# what `sacctmgr show assoc user=$USER` returns there. `benchmark` +# (inherited from gb300-nv) does not exist on cw. +export SLURM_PARTITION="all" +export SLURM_ACCOUNT="cw-sup" + +# Pyxis/enroot's NVIDIA prestart hook reads these from the runtime env +# to decide which host driver libraries (libcuda.so.1, libnvidia-*.so) +# to mount into the container. cw doesn't set them by default — without +# them the container has no libcuda and CUDA init fails. 
SLURM's default +# --export=ALL propagates these from this shell through sbatch+srun +# into the enroot environment. +export NVIDIA_VISIBLE_DEVICES=all +export NVIDIA_DRIVER_CAPABILITIES=compute,utility + +NGINX_IMAGE="nginx:1.27.4" +# Pin to fzyzcjy/srt-slurm fork branch `feat/random-num-workers` +# (= NVIDIA/srt-slurm@9d75f82 + sa-bench parallel random prompt +# generation). The single-threaded random prompt generator in the +# upstream sa-bench dominates bench startup on the 7p1d/conc=8192 +# sweep (~50 min for the main pass alone before the first HTTP +# request leaves the client). The fork bumps that to ~1 min via +# multiprocessing.Pool with `--random-num-workers 48`. +# +# TODO: revert to a NVIDIA/srt-slurm pin once the upstream PR +# (https://github.com/NVIDIA/srt-slurm/pull/114) merges. +SRT_SLURM_RECIPES_REPO="https://github.com/fzyzcjy/srt-slurm.git" +SRT_SLURM_RECIPES_COMMIT="4249d168208ff5ff1f30b3c1158d893cc0615bb5" + +# Squash files live alongside models on /mnt/vast (shared across nodes). +# `squash_dupe` instead of `squash` to use '_'-separated names: srtctl / +# pyxis rejects '+' in image paths with "Invalid image format", and the +# old /mnt/vast/squash dir contains '+'-separated files from prior runs. +SQUASH_DIR="/mnt/vast/squash_dupe" +mkdir -p "$SQUASH_DIR" +# Compute nodes (slurm-gb300-138-*, slurm-gb300-139-*) are aarch64; the +# image `lmsysorg/sglang:deepseek-v4-grace-blackwell` is published as +# arm64-only. The CI runner pod is x86_64 and (a) cannot run +# `enroot import` for the arm64 manifest because `enroot-aufs2ovlfs` +# needs CAP_SYS_ADMIN that the pod lacks ("Operation not permitted"), +# and (b) even with `--arch aarch64` the conversion still fails on x86. +# Per `https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780` +# the import has to be dispatched to an arm64 compute node via srun. 
+# To keep CI self-contained we instead pin to the pre-staged arm64 sqsh +# under /mnt/vast/squash_dupe/ (refreshed manually by running that gist +# script when the docker tag is updated). Filename suffix `_arm64` +# distinguishes the working arm64 sqsh from any stale amd64 shadow. +SQUASH_FILE="$SQUASH_DIR/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" +NGINX_SQUASH_FILE="$SQUASH_DIR/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g')_arm64.sqsh" + +if [[ ! -f "$SQUASH_FILE" ]]; then + echo "ERROR: pre-staged arm64 sqsh missing: $SQUASH_FILE" >&2 + echo "Refresh it on a GB300 compute node via the script in the gist:" >&2 + echo " https://gist.github.com/Fridge003/42c6001e0bb613acf0e411305b8ea780" >&2 + exit 1 +fi +if [[ ! -f "$NGINX_SQUASH_FILE" ]]; then + echo "ERROR: pre-staged arm64 nginx sqsh missing: $NGINX_SQUASH_FILE" >&2 + echo "Run on an aarch64 host:" >&2 + echo " enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" >&2 + exit 1 +fi + +export EVAL_ONLY="${EVAL_ONLY:-false}" + +export ISL="$ISL" +export OSL="$OSL" + +# srt-slurm path requires a CONFIG_FILE pointing to a recipe YAML. +# Without it, srtctl apply scans every YAML in the repo and submits +# hundreds of jobs. +if [[ -z "$CONFIG_FILE" ]]; then + echo "Error: CONFIG_FILE is not set. The srt-slurm path requires a CONFIG_FILE in additional-settings." >&2 + echo "Config: MODEL_PREFIX=${MODEL_PREFIX} PRECISION=${PRECISION} FRAMEWORK=${FRAMEWORK}" >&2 + exit 1 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" +fi + +git clone "$SRT_SLURM_RECIPES_REPO" "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout "$SRT_SLURM_RECIPES_COMMIT" + +# Overlay the hand-rolled DSV4 sglang recipes onto the upstream srt-slurm +# checkout. 
Mirrors launch_gb200-nv.sh's dynamo-sglang dsv4 branch: +# destination must be `recipes/sglang/deepseek-v4` because +# `additional-settings: CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/...` +# in `.github/configs/nvidia-master.yaml` is what srtctl loads. +mkdir -p recipes/sglang/deepseek-v4 +cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 + +echo "Installing srtctl..." +# CRITICAL — uv install location. +# Runner pod is x86 but compute nodes are aarch64, and /mnt/home is +# shared NFS across both. srtctl's slurm template (job_script_minimal.j2) +# does `if ! command -v uv` and skips its own ARM64 install when uv is +# already on PATH; on compute nodes $HOME/.local/bin is on PATH by +# default, so a stray x86 binary at $HOME/.local/bin/uv from this +# runner shadows the template's install and crashes the orchestrator +# with `cannot execute binary file: Exec format error`. Install to a +# runner-pod-local /tmp path (tmpfs, not NFS) and scrub any stale x86 +# uv left in the shared path by prior runs. +rm -f "$HOME/.local/bin/uv" "$HOME/.local/bin/uvx" +export XDG_BIN_HOME="/tmp/uv-runner-${RUNNER_NAME:-default}/bin" +mkdir -p "$XDG_BIN_HOME" +curl -LsSf https://astral.sh/uv/install.sh | env INSTALLER_NO_MODIFY_PATH=1 sh +export PATH="$XDG_BIN_HOME:$PATH" + +if [ ! -x "$XDG_BIN_HOME/uv" ]; then + echo "ERROR: uv not at $XDG_BIN_HOME/uv after install — install script may not honor XDG_BIN_HOME on this version. Aborting before x86 uv leaks onto NFS." >&2 + exit 1 +fi +if [ -e "$HOME/.local/bin/uv" ]; then + echo "ERROR: uv install leaked to shared $HOME/.local/bin/uv. Remove it and re-run." >&2 + exit 1 +fi + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" + +# Persistent cluster-wide cache for `dynamo: hash:` source builds. The +# upstream cache root (_DYNAMO_CACHE_ROOT in srtctl/core/schema.py) is +# `/configs/dynamo-wheels`; without an override that dir lives inside +# `srt-slurm/configs`, which the launcher wipes via `rm -rf` every CI +# run, so each run does a cold ~10-20 min rust+pyo3 build. Stage the +# cache on /mnt/vast (NFS, shared by all gb300-cw_N runners) and have +# srtctl bind-mount it over `/configs/dynamo-wheels` via the cluster +# `default_mounts` setting. flock inside srtctl serializes cold-cache +# builds across concurrent matrix jobs. +DYNAMO_WHEELS_CACHE_HOST="/mnt/vast/dynamo-wheels-cache" +mkdir -p "$DYNAMO_WHEELS_CACHE_HOST" +mkdir -p configs/dynamo-wheels + +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml < "$TMP_CONFIG_FILE" + mv "$TMP_CONFIG_FILE" "$CONFIG_FILE" +fi + +SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +set +x + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +LOGS_DIR="outputs/$JOB_ID/logs" +LOG_FILE="$LOGS_DIR/sweep_${JOB_ID}.log" + +while ! ls "$LOG_FILE" &>/dev/null; do + if ! squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; then + echo "ERROR: Job $JOB_ID failed before creating log file" + scontrol show job "$JOB_ID" + exit 1 + fi + echo "Waiting for JOB_ID $JOB_ID to begin and $LOG_FILE to appear..." + sleep 5 +done + +( + while squeue -j "$JOB_ID" --noheader 2>/dev/null | grep -q "$JOB_ID"; do + sleep 10 + done +) & +POLL_PID=$! 
+ +echo "Tailing LOG_FILE: $LOG_FILE" + +tail -F -s 2 -n+1 "$LOG_FILE" --pid=$POLL_PID 2>/dev/null + +wait $POLL_PID + +set -x + +echo "Job $JOB_ID completed!" +echo "Collecting results..." + +if [ -d "$LOGS_DIR" ]; then + echo "Found logs directory: $LOGS_DIR" + cp -r "$LOGS_DIR" "$GITHUB_WORKSPACE/LOGS" + tar czf "$GITHUB_WORKSPACE/multinode_server_logs.tar.gz" -C "$LOGS_DIR" . +else + echo "Warning: Logs directory not found at $LOGS_DIR" +fi + +if [[ "${EVAL_ONLY:-false}" != "true" ]]; then + if [ ! -d "$LOGS_DIR" ]; then + exit 1 + fi + + RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" + else + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + CONFIG_NAME=$(basename "$result_subdir") + + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done + fi + + echo "All result files processed" +else + echo "EVAL_ONLY=true: Skipping benchmark result collection" +fi + +if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then + 
EVAL_DIR="$LOGS_DIR/eval_results" + if [ -d "$EVAL_DIR" ]; then + echo "Extracting eval results from $EVAL_DIR" + shopt -s nullglob + for eval_file in "$EVAL_DIR"/*; do + [ -f "$eval_file" ] || continue + cp "$eval_file" "$GITHUB_WORKSPACE/" + echo "Copied eval artifact: $(basename "$eval_file")" + done + shopt -u nullglob + else + echo "WARNING: RUN_EVAL=true but no eval results found at $EVAL_DIR" + fi +fi diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py index e543bb4af..e9a2195ed 100644 --- a/utils/matrix_logic/generate_sweep_configs.py +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -114,7 +114,11 @@ def _max_eval_conc(ie): ) mn_groups[key].append((i, entry)) - for entries in mn_groups.values(): + for key, entries in mn_groups.items(): + # TODO(pr1157): srt-slurm pin (9d75f82) lacks the lm-eval orchestrator path + # (only on sa-submission-q2-2026). Skip eval-only here until the pin is bumped. + if key[:3] == ("deepseek-ai/DeepSeek-V4-Pro", "gb300-cw", "dynamo-sglang"): + continue best_idx, best_entry = max(entries, key=_max_eval_conc) eval_indices.add(best_idx) # Set eval-conc to median of eligible conc values to avoid OOM during eval