Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
26e540d
feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark
cquil11 Apr 24, 2026
efdc8ba
fix: switch dsv4-fp4-b300-sglang to Pro + Max-Throughput recipe
cquil11 Apr 24, 2026
cc35a12
chore: sync launch_b200-dgxc-slurm.sh cache mount from claude/add-dsv…
cquil11 Apr 24, 2026
404a097
fix: restore trailing whitespace stripped from glm5.1 changelog entry
cquil11 Apr 24, 2026
97a488e
chore: add flock-guarded squash import to B300 runner
cquil11 Apr 24, 2026
106deea
fix: drop ENROOT_CACHE_PATH override from B300 runner
cquil11 Apr 24, 2026
4bb1f1a
chore: point B300 runner at shared gharunners/{squash,hf-hub-cache}
cquil11 Apr 24, 2026
744c5a0
fix: move enroot import out of srun to avoid pyxis namespace collision
cquil11 Apr 24, 2026
d003c59
fix: wipe stale pyxis scratch dirs for this JOB_ID before benchmark srun
cquil11 Apr 24, 2026
f00629f
Revert: drop all B300 runner changes, mirror #1128's approach
cquil11 Apr 24, 2026
570b0eb
runner: add head-node flock-guarded squash import on B300
cquil11 Apr 24, 2026
864419d
fix: mount at /ix and clear baked-in CUDA_VISIBLE_DEVICES
cquil11 Apr 24, 2026
5d93913
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 24, 2026
9453676
runner: use /data/models pre-staged path for dsv4 on B300
cquil11 Apr 24, 2026
5db43b8
fix: switch B300 dsv4 sglang to bw-ultra-compiled image
cquil11 Apr 24, 2026
c060c58
fix: switch B300 dsv4 sglang image to yhyang201/sglang-b300:v3
cquil11 Apr 24, 2026
08edf26
update b300
cquil11 Apr 24, 2026
a699ca0
feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split search-space
cquil11 Apr 24, 2026
d35696c
update b300
cquil11 Apr 24, 2026
bc43672
feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at every CONC
cquil11 Apr 24, 2026
87c8376
trigger test check
cquil11 Apr 25, 2026
aa423f0
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 25, 2026
90e8f3d
Revert "feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at ev…
cquil11 Apr 25, 2026
8e3158d
trigger test check
cquil11 Apr 25, 2026
623baa1
Move dsv4 b300 sglang bench script to framework-tagged path
cquil11 Apr 25, 2026
54b2ced
chore(perf-changelog): tighten dsv4-fp4-b300-sglang entry
cquil11 Apr 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 25 additions & 12 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1832,9 +1832,11 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300
model: deepseek-ai/DeepSeek-V4-Pro
Expand All @@ -1843,22 +1845,33 @@ dsv4-fp4-b300-sglang:
precision: fp4
framework: sglang
multinode: false
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (32 < CONC <= 1024... i.e. CONC > 128): + DP-attn, higher max-running-requests cap than balanced
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
Expand Down
54 changes: 40 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,46 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 4096
--disable-flashinfer-autotune
--mem-fraction-static 0.82
)
# Recipe dispatch keyed on CONC, per
# https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
#   low-latency    (CONC <= 32):        TP-only, chunked prefill, autotune off
#   balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
#   max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Flags shared by the two DeepEP (DP-attention) recipes; balanced and
# max-throughput differ only in the --max-running-requests cap appended below.
dp_attn_common=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
    --mem-fraction-static 0.82
    --cuda-graph-max-bs 64
)

if [[ $CONC -le 32 ]]; then
    RECIPE=low-latency
    RECIPE_FLAGS=(
        --moe-runner-backend flashinfer_mxfp4
        --chunked-prefill-size 4096
        --disable-flashinfer-autotune
        --mem-fraction-static 0.82
    )
elif [[ $CONC -le 128 ]]; then
    RECIPE=balanced
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=("${dp_attn_common[@]}" --max-running-requests 128)
else
    RECIPE=max-throughput
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=("${dp_attn_common[@]}" --max-running-requests 256)
fi
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1812,3 +1812,10 @@
- "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
- "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the max-throughput scenario runs again"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
Loading