33 changes: 20 additions & 13 deletions .github/configs/nvidia-master.yaml
@@ -1832,33 +1832,40 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): TP-only, TP=4
# max-throughput (CONC > 128): + DP-attn + DeepEP
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
86 changes: 63 additions & 23 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -5,6 +5,7 @@ source "$(dirname "$0")/../benchmark_lib.sh"
check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
@@ -23,12 +24,13 @@ fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
unset CUDA_VISIBLE_DEVICES
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -40,7 +42,7 @@ unset CUDA_VISIBLE_DEVICES
SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
@@ -50,21 +52,57 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 4096
--disable-flashinfer-autotune
--mem-fraction-static 0.82
)
echo "Recipe: $RECIPE (CONC=$CONC)"
# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
if [[ "$ISL" == "1024" ]]; then
SWA_FULL_TOKENS_RATIO=0.5
else
SWA_FULL_TOKENS_RATIO=0.1
fi

# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
# script's pattern). DP-attention runs the empirically-tuned high-concurrency
# recipe (flashinfer_mxfp4 runner + 16k prefill chunks + prefill-delayer);
# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Default; the DP-attn branch below overrides to 0.94.
MEM_FRACTION_STATIC=0.90

if [ "${DP_ATTENTION}" = "true" ]; then
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
Comment on lines +81 to +89
Contributor
🔴 The DP_ATTENTION=true branch (lines 81-89) builds PARALLEL_ARGS without --moe-a2a-backend deepep, so the max-throughput recipe at conc=512 runs with ep_size=1 (the default) instead of ep=4 despite nvidia-master.yaml declaring { tp: 4, ep: 4, dp-attn: true } and the perf-changelog advertising "max-throughput (TP=4, EP=4, DP-attn + DeepEP)". Result filenames will carry ep=4-dpa=true while sglang actually runs EP=1, mislabeling the artifact. To fix: add --moe-a2a-backend deepep (and drop --moe-runner-backend flashinfer_mxfp4, which is the non-EP backend) to match the sibling dsv4_fp4_b300_sglang_mtp.sh (lines 86-91) and dsv4_fp4_b200.sh (lines 63-69).

Extended reasoning...

What the bug is

The DP-attention branch in benchmarks/single_node/dsv4_fp4_b300_sglang.sh builds PARALLEL_ARGS like this:

if [ "${DP_ATTENTION}" = "true" ]; then
    ...
    PARALLEL_ARGS=(
        --dp-size "$TP"
        --enable-dp-attention
        --moe-runner-backend flashinfer_mxfp4
        --disable-flashinfer-autotune
        --deepep-config "$DEEPEP_CONFIG"
        --chunked-prefill-size 16384
        --enable-prefill-delayer
    )
    ...
fi

The --moe-a2a-backend deepep flag is missing. Per the YAML comment added by this same PR (.github/configs/nvidia-master.yaml:1854):

ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, while low-latency leaves ep_size at the default of 1.

Without that flag, sglang leaves ep_size at the default of 1, so the run never actually exercises EP=4 + DeepEP.

How it manifests / step-by-step proof

  1. The YAML row is { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }. The orchestrator sets TP=4, EP_SIZE=4, DP_ATTENTION=true, CONC=512 in env.
  2. The script enters the DP_ATTENTION=true branch and assembles PARALLEL_ARGS without --moe-a2a-backend deepep.
  3. sglang serve is invoked with --tp 4 --dp-size 4 --enable-dp-attention --moe-runner-backend flashinfer_mxfp4 --deepep-config ... but no --moe-a2a-backend deepep. Sglang therefore leaves ep_size=1 (default) and the --deepep-config is dead config because no DeepEP backend was selected.
  4. The benchmark template (benchmark-tmpl.yml:146) bakes EP_SIZE and DP_ATTENTION into the result filename via ep${EP_SIZE} and dpa=${DP_ATTENTION} suffixes — so the output carries ep=4-dpa=true even though the runtime is actually EP=1.
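A hypothetical reconstruction of that suffix logic (benchmark-tmpl.yml itself is not part of this diff, so the template shape shown here is an assumption based on step 4):

# Env the orchestrator sets for the max-throughput row:
TP=4 EP_SIZE=4 DP_ATTENTION=true CONC=512
# Suffix shape described above, baked into the artifact name:
echo "ep${EP_SIZE}-dpa=${DP_ATTENTION}"   # -> ep4-dpa=true, while sglang actually runs ep_size=1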

Why existing code does not prevent it

Nothing in this script reads EP_SIZE or asserts it against the actual sglang flags. The shell script just passes through TP from env and constructs --dp-size "$TP"; EP_SIZE only appears in the result filename, never in the launch command. The two sister scripts get this right — dsv4_fp4_b300_sglang_mtp.sh:89 and dsv4_fp4_b200.sh:66 both pass --moe-a2a-backend deepep and omit --moe-runner-backend flashinfer_mxfp4 in their DP-attention branches, because deepep handles MoE on its own.

Impact

The recipe-per-CONC max-throughput row at conc=512 (the headline row of this PR) does not actually exercise the configuration the YAML and perf-changelog advertise. The artifact filenames are mislabeled (ep=4-dpa=true vs. actual EP=1), so any downstream Pareto analysis comparing this row against dsv4-fp4-b200-sglang's real EP=8 + DeepEP run will draw incorrect conclusions about B300 vs B200. This defeats the entire purpose of the recipe-per-CONC split that this PR introduces.

Suggested fix

Mirror the sister scripts: add --moe-a2a-backend deepep and drop --moe-runner-backend flashinfer_mxfp4 in the DP_ATTENTION=true branch. The TP-only else branch correctly keeps flashinfer_mxfp4. Alternatively, if the intent really is no DeepEP for B300, update the YAML row to ep: 1 and the perf-changelog to drop the "+ DeepEP" claim — but the former is the obviously-correct fix given the YAML comment and sister scripts.
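A minimal sketch of that fix, mirroring the flag set this comment attributes to the sister scripts (a proposal, not the merged code):

if [ "${DP_ATTENTION}" = "true" ]; then
    ...
    PARALLEL_ARGS=(
        --dp-size "$TP"
        --enable-dp-attention
        --moe-a2a-backend deepep      # forces ep_size=tp_size, so the ep=4 filename suffix is accurate
        --disable-flashinfer-autotune
        --deepep-config "$DEEPEP_CONFIG"
        --chunked-prefill-size 16384
        --enable-prefill-delayer
    )
    ...
fi

Note that --moe-runner-backend flashinfer_mxfp4 is dropped from this branch (DeepEP handles MoE dispatch itself, per the sister scripts); the TP-only else branch keeps it.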

MEM_FRACTION_STATIC=0.94
else
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
)
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
# launch config is auditable from the result artifact alone.
{
echo "=== SGLANG_* env vars at launch ==="
env | grep -E '^SGLANG_' | sort
echo "==================================="
} | tee "$SERVER_LOG"

set -x
PYTHONNOUSERSITE=1 sglang serve \
@@ -73,8 +111,10 @@ PYTHONNOUSERSITE=1 sglang serve \
--port $PORT \
--trust-remote-code \
--tp $TP \
--disable-radix-cache \
"${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
--max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
--mem-fraction-static "$MEM_FRACTION_STATIC" \
--swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &

SERVER_PID=$!

9 changes: 9 additions & 0 deletions perf-changelog.yaml
@@ -1869,3 +1869,12 @@
- "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Recipe-per-CONC split for DeepSeek-V4-Pro on B300: low-latency (TP=8, EP=1), balanced (TP=4, EP=1) at conc=32, max-throughput (TP=4, EP=4, DP-attn + DeepEP) at conc=512, for both 1k1k and 8k1k"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
- "Image pinned to lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3"
- "DP-attention path enables SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 for better SWA eviction behavior"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1185
