diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1d467308f..42c720a63 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1832,9 +1832,11 @@ dsr1-fp8-b300-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
-# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
-# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
+# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# lists B200 (not B300) as the Blackwell target. This config reuses the
+# B200 Pro FP4 recipes (up to Max-Throughput: DP=8 + DeepEP, no MTP)
+# on B300 until B300-specific recipes ship. Prefix caching is disabled.
+# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-b300
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1843,22 +1845,33 @@ dsv4-fp4-b300-sglang:
   precision: fp4
   framework: sglang
   multinode: false
-  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
-  # while the DeepEP FP8 weight-postprocess path is broken for this
-  # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
-  # integers. raised from sglang.srt.layers.quantization.fp8
-  # .process_weights_after_loading_block_quant). Full concurrency sweep
-  # retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
-  # once sglang can load the checkpoint under --moe-a2a-backend deepep.
+  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+  # are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
+  #   low-latency    (CONC <= 32):       TP-only
+  #   balanced       (32 < CONC <= 128): + DP-attn + DeepEP, max-running-requests=128
+  #   max-throughput (CONC > 128):       + DP-attn + DeepEP, max-running-requests=256
+  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
+  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
     - isl: 1024
       osl: 1024
       search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+        # max-throughput
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
     - isl: 8192
       osl: 1024
       search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+        # max-throughput
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index c9fb238a5..faa946174 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -50,20 +50,46 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
-# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
-# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
-# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
-# Restore the CONC-based low-latency / balanced / max-throughput dispatch
-# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
-# --moe-a2a-backend deepep.
-RECIPE=low-latency
-RECIPE_FLAGS=(
-  --moe-runner-backend flashinfer_mxfp4
-  --chunked-prefill-size 4096
-  --disable-flashinfer-autotune
-  --mem-fraction-static 0.82
-)
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):       TP-only, chunked-prefill, disable autotune
+#   - balanced       (32 < CONC <= 128): + DP-attn + DeepEP, max-running-requests=128
+#   - max-throughput (CONC > 128):       + DP-attn + DeepEP, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+  RECIPE=low-latency
+  RECIPE_FLAGS=(
+    --moe-runner-backend flashinfer_mxfp4
+    --chunked-prefill-size 4096
+    --disable-flashinfer-autotune
+    --mem-fraction-static 0.82
+  )
+elif [[ $CONC -le 128 ]]; then
+  RECIPE=balanced
+  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+  RECIPE_FLAGS=(
+    --dp-size "$TP"
+    --enable-dp-attention
+    --moe-a2a-backend deepep
+    --deepep-config "$DEEPEP_CONFIG"
+    --mem-fraction-static 0.82
+    --cuda-graph-max-bs 64
+    --max-running-requests 128
+  )
+else
+  RECIPE=max-throughput
+  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+  RECIPE_FLAGS=(
+    --dp-size "$TP"
+    --enable-dp-attention
+    --moe-a2a-backend deepep
+    --deepep-config "$DEEPEP_CONFIG"
+    --mem-fraction-static 0.82
+    --cuda-graph-max-bs 64
+    --max-running-requests 256
+  )
+fi
 
 echo "Recipe: $RECIPE (CONC=$CONC)"
 set -x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a6c811748..397da6591 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1812,3 +1812,10 @@
     - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
     - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the balanced and max-throughput scenarios run again"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
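A quick way to sanity-check the dispatch above (not part of the patch): the CONC thresholds decide which recipe a run gets, and an off-by-one at 32 or 128 would silently benchmark the wrong configuration. The sketch below mirrors the thresholds from dsv4_fp4_b300_sglang.sh as a standalone check; select_recipe is a hypothetical helper introduced here for illustration only, since the script itself sets RECIPE inline.

    #!/usr/bin/env bash
    # Hypothetical standalone check: mirrors the CONC -> recipe thresholds
    # from benchmarks/single_node/dsv4_fp4_b300_sglang.sh.
    select_recipe() {
      local conc=$1
      if (( conc <= 32 )); then
        echo low-latency      # TP-only, no DeepEP
      elif (( conc <= 128 )); then
        echo balanced         # DP-attn + DeepEP, max-running-requests=128
      else
        echo max-throughput   # DP-attn + DeepEP, max-running-requests=256
      fi
    }

    # Boundary values taken from the conc-start/conc-end rows in the YAML above.
    for conc in 4 32 64 128 256 512 1024; do
      printf 'CONC=%-4s -> %s\n' "$conc" "$(select_recipe "$conc")"
    done

Each conc-end in the YAML search-space rows lands on the same side of a threshold as its conc-start, so every swept concurrency resolves to the recipe its result filename claims.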