Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
8242762
gb300 1k1k sglang
Oseltamivir Apr 26, 2026
ba062c0
route gb300 sglang to cw cluster
Oseltamivir Apr 26, 2026
4f7d3bc
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
c21afd3
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7903970
connector
Oseltamivir Apr 26, 2026
26943f7
path
Oseltamivir Apr 26, 2026
e7b58f7
drop forced dynamo 0.8.1 install — use container-bundled dynamo for D…
Oseltamivir Apr 26, 2026
74d8307
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
7f38f8c
Merge remote-tracking branch 'origin/main' into gb300-1k1k-sglang
Oseltamivir Apr 26, 2026
fa52ab0
match upstream PR #75 tunings + skip srtctl dynamo install
Oseltamivir Apr 26, 2026
bc80a16
add flags
hnyls2002 Apr 26, 2026
7f43185
add more selection space
hnyls2002 Apr 26, 2026
afca046
use _arm64 image tag + squash_dupe dir for gb300-cw
Oseltamivir Apr 27, 2026
3882a55
pin dynamo to 1.2.0.dev20260426 — first arm64 wheel with DSv4 formatter
Oseltamivir Apr 27, 2026
77bbcb8
step back to dynamo dev20260425 — earlier wheel may align with contai…
Oseltamivir Apr 27, 2026
d7dc646
prebuild dynamo wheel from hash 6a159fed on /mnt/vast — mirror PR #11…
Oseltamivir Apr 27, 2026
56b64e8
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
5e3340c
switch disagg transport nixl → mooncake
Oseltamivir Apr 27, 2026
83867ea
strip return_routed_experts kwarg from dynamo call sites — sglang 0.5…
Oseltamivir Apr 27, 2026
3efc208
fix dynamo regex: only match whole-line kwarg passes, leave assignmen…
Oseltamivir Apr 27, 2026
9a4018c
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 27, 2026
173bd41
PR85
Oseltamivir Apr 28, 2026
5dc00ed
Import recipes
Oseltamivir Apr 28, 2026
5b88465
Merge branch 'main' into gb300-1k1k-sglang
Oseltamivir Apr 29, 2026
93cc3c3
Update perf-changelog.yaml
Oseltamivir Apr 29, 2026
81bba88
config syntax
Oseltamivir Apr 29, 2026
628f45b
Merge main into gb300 SGLang PR
Oseltamivir Apr 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7722,3 +7722,38 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 8
ep: 8
dp-attn: true

# GB300 (gb300-cw) DeepSeek-V4-Pro FP4 disaggregated run via Dynamo + SGLang.
dsv4-fp4-gb300-dynamo-sglang:
  # _arm64 variant: GH runner pod doing `enroot import` is amd64, but
  # gb300-cw compute nodes are aarch64 (Grace). Without the explicit
  # arm64 tag the registry serves the amd64 manifest, which fails to
  # exec on the compute side.
  image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300-cw
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  # Uses the sa-bench recipe copied exactly from NVIDIA/srt-slurm:
  # recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml
  # at commit 9d75f82acec163594658a440f39dd7f1bd35bd16.
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        # 1 prefill worker and 3 decode workers, each TP=8.
        - conc-list: [32, 64, 128, 256, 512, 1024]
          prefill:
            num-worker: 1
            tp: 8
            ep: 1
            dp-attn: false
            additional-settings:
              - "CONFIG_FILE=recipes/dsv4-pro/sglang/gb200-fp4/1k1k/disagg/stp/disagg-1p3d-tp8.yaml"
          decode:
            # NOTE(review): decode carries no CONFIG_FILE additional-setting,
            # unlike prefill — confirm the harness's default is intended here.
            num-worker: 3
            tp: 8
            ep: 1
            dp-attn: false
5 changes: 5 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,8 @@ gb300:
- 'gb300-nv_0'
- 'gb300-nv_1'
- 'gb300-nv_2'
# GB300 runner pool on the cw cluster (presumably CoreWeave — confirm);
# four node labels. Per the gb300-cw config comments, these compute nodes
# are aarch64 (Grace).
gb300-cw:
  - 'gb300-cw_0'
  - 'gb300-cw_1'
  - 'gb300-cw_2'
  - 'gb300-cw_3'
3 changes: 3 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ jobs:
set -x
# Export RESULT_FILENAME early so it's available for artifact uploads even if cancelled
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
rm -f multinode_server_logs.tar.gz
rm -rf LOGS
rm -f ${RESULT_FILENAME}_*.json agg_${RESULT_FILENAME}_*.json

export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }}
export IS_MULTINODE=true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4)
# (header fixed: resources below declare 1 prefill + 1 decode worker and the
# recipe name says "1p1d" — the previous "1P3D" label was a copy-paste slip)
#
# AIME 2025 (aime25): all 30 problems, full concurrency

name: "dsv4-pro-gb200-1k1k-disagg-1p1d-tp8-aime"

dynamo:
  # Pinned Dynamo revision this recipe was validated against.
  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"

frontend:
  type: dynamo
  nginx_container: nginx

model:
  path: "dspro"
  container: "dspro-0426-nixl"
  precision: "mxfp4"

resources:
  gpu_type: "gb200"
  # tensor-parallel-size 8 with 4 GPUs per node -> each worker spans 2 nodes.
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_node: 4

health_check:
  # 360 attempts x 10 s = up to 1 hour for all workers to report healthy.
  max_attempts: 360
  interval_seconds: 10

backend:
  type: sglang

# Very large disaggregation timeout / heartbeat-failure limits — presumably
# to keep slow multinode startup from aborting the run (confirm intent);
# DeepGEMM JIT precompile is disabled.
prefill_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

# Identical environment on the decode side.
decode_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

sglang_config:
  # Prefill and decode differ only in disaggregation-mode and
  # cuda-graph-max-bs (2048 vs 1024).
  prefill:
    disaggregation-bootstrap-port: 30001
    served-model-name: "dspro"
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    max-running-requests: 1024
    cuda-graph-max-bs: 2048
    mem-fraction-static: 0.85

  decode:
    served-model-name: "dspro"
    disaggregation-bootstrap-port: 30001
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    max-running-requests: 1024
    cuda-graph-max-bs: 1024
    mem-fraction-static: 0.85

benchmark:
  type: "aime"
  aime_dataset: "aime25"
  # One thread per AIME problem (all 30 problems in flight at once).
  num_threads: 30
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# DeepSeek-V4-Pro disaggregated on GB200 (1P3D, TP=8, MXFP4)
#
# Some basic rate matching
# TODO: no optimizations have been applied yet
#
# NOTE(review): this recipe has no top-level `name:` field, unlike the
# sibling aime/8k1k recipes — confirm the harness treats it as optional.

dynamo:
  # Pinned Dynamo revision this recipe was validated against.
  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"

frontend:
  type: dynamo
  nginx_container: nginx

model:
  path: "dspro"
  container: "dspro-0426"
  precision: "mxfp4"

resources:
  gpu_type: "gb200"
  # tensor-parallel-size 8 with 4 GPUs per node -> 2 nodes per worker:
  # 1 prefill worker on 2 nodes, 3 decode workers on 6 nodes.
  prefill_nodes: 2
  decode_nodes: 6
  prefill_workers: 1
  decode_workers: 3
  gpus_per_node: 4

health_check:
  # 360 attempts x 10 s = up to 1 hour for all workers to report healthy.
  max_attempts: 360
  interval_seconds: 10

backend:
  type: sglang

# Very large disaggregation timeout / heartbeat-failure limits — presumably
# to keep slow multinode startup from aborting the run (confirm intent);
# DeepGEMM JIT precompile is disabled.
prefill_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

# Identical environment on the decode side.
decode_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

sglang_config:
  # Prefill and decode differ only in disaggregation-mode and
  # cuda-graph-max-bs (2048 vs 1024).
  prefill:
    disaggregation-bootstrap-port: 30001
    served-model-name: "dspro"
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    max-running-requests: 1024
    cuda-graph-max-bs: 2048
    mem-fraction-static: 0.85

  decode:
    served-model-name: "dspro"
    disaggregation-bootstrap-port: 30001
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    max-running-requests: 1024
    cuda-graph-max-bs: 1024
    mem-fraction-static: 0.85

benchmark:
  # sa-bench synthetic load: 1k in / 1k out, concurrency sweep 32..1024,
  # unthrottled request rate.
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "32x64x128x256x512x1024"
  req_rate: "inf"
  use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# DeepSeek-V4-Pro disaggregated on GB200 (1P1D, TP=8, MXFP4) — 8k1k newtp + dspro-0426.
# WIP

name: "gb200-mxfp4-8k1k-disagg-newtp"

dynamo:
  # Pinned Dynamo revision this recipe was validated against.
  hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d"

frontend:
  type: dynamo
  nginx_container: nginx

model:
  path: "dspro"
  container: "dspro-0426"
  precision: "mxfp4"

resources:
  gpu_type: "gb200"
  # tensor-parallel-size 8 with 4 GPUs per node -> each worker spans 2 nodes.
  prefill_nodes: 2
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_node: 4

health_check:
  # 360 attempts x 10 s = up to 1 hour for all workers to report healthy.
  max_attempts: 360
  interval_seconds: 10

backend:
  type: sglang

# Same timeout/precompile settings as the 1k1k recipes, plus four
# SGLANG_OPT_* switches (SWA split-on-insert, JIT norm, JIT indexer
# metadata, topk v2) — presumably the "newtp" experimental kernels this
# recipe exists to measure; confirm against the SGLang build in dspro-0426.
prefill_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"

# Identical environment on the decode side.
decode_environment:
  SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
  SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"

sglang_config:
  # vs. the 1k1k recipes: mem-fraction-static raised to 0.90, request /
  # cuda-graph caps lowered to 512, and swa-full-tokens-ratio added.
  # Prefill and decode sections are otherwise identical here.
  prefill:
    disaggregation-bootstrap-port: 30001
    served-model-name: "dspro"
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 512
    cuda-graph-max-bs: 512
    swa-full-tokens-ratio: 0.1

  decode:
    served-model-name: "dspro"
    disaggregation-bootstrap-port: 30001
    trust-remote-code: true
    tensor-parallel-size: 8
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    chunked-prefill-size: 8192
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 512
    cuda-graph-max-bs: 512
    swa-full-tokens-ratio: 0.1

benchmark:
  # sa-bench synthetic load: 8k in / 1k out, concurrency sweep 32..512.
  # Unlike the 1k1k sa-bench recipe this one applies the chat template and
  # a custom DeepSeek-V4 tokenizer adapter.
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "32x64x128x256x512"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
11 changes: 10 additions & 1 deletion perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1877,7 +1877,7 @@
- "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3"
- "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185

- config-keys:
- dsv4-fp4-b200-sglang
description:
Expand Down Expand Up @@ -1985,3 +1985,12 @@
- "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
- "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218

- config-keys:
- dsv4-fp4-gb300-dynamo-sglang
description:
- "Add DeepSeek-V4-Pro FP4 GB300 Dynamo SGLang disaggregated multinode configuration"
- "Image: lmsysorg/sglang:deepseek-v4-grace-blackwell_arm64 (gb300-cw compute nodes are aarch64)"
- "Topology: 1 prefill worker + 3 decode workers, TP=8, MXFP4 MoE kernels, NIXL KV transfer"
- "Recipes copied exactly from NVIDIA/srt-slurm recipes/dsv4-pro/sglang/gb200-fp4 at commit 9d75f82acec163594658a440f39dd7f1bd35bd16"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1169
Loading
Loading