Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1867,6 +1867,36 @@ dsv4-fp4-b300-sglang:
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
# DP_ATTENTION:
# dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
# dp-attn: true -> DP-attn + deepep mega_moe + chunked-prefill 32768
# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while the TP-only path leaves ep_size at the default of 1.
dsv4-fp4-b300-sglang-mtp:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# EAGLE/MTP (3/1/4) sweep. Only band A is configured below:
# A: TP=8 ep=1 -- conc 1-8 (latency-bound, full TP)
# NOTE(review): bands B (tp=4, conc 16-128) and C (tp=4 ep=4 dp-attn,
# conc 64-512) are described here and in perf-changelog.yaml but are absent
# from both search-spaces below -- confirm whether they were trimmed on purpose.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
model: Qwen/Qwen3.5-397B-A17B
Expand Down
149 changes: 149 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env bash

# Single-node benchmark: DeepSeek-V4-Pro FP4 on B300 with SGLang and EAGLE/MTP
# speculative decoding. Driven by the dsv4-fp4-b300-sglang-mtp entry in
# .github/configs/nvidia-master.yaml, which supplies the env vars below.
source "$(dirname "$0")/../benchmark_lib.sh"

# Tuning inputs from the matrix (all required):
# TP -- tensor parallel size -> --tp
# EP_SIZE -- expert parallel size -> --ep-size
# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP
# Also selects MoE backend / chunked-prefill-size:
# true -> deepep + mega_moe + chunked-prefill 32768
# false -> flashinfer_mxfp4 + chunked-prefill 8192
#
# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
check_env_vars \
  MODEL \
  TP \
  EP_SIZE \
  DP_ATTENTION \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# ${VAR:-} keeps this safe under `set -u` when the script runs outside Slurm
# (e.g. local debugging), where SLURM_* vars are unset.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown-node}"
fi

# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
# `hf download`. Only fetch when MODEL looks like a HF repo ID.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

# Record GPU inventory and driver state in the CI step log before launch.
nvidia-smi

# Common SGLANG env vars (apply to every config). PRECOMPILE is the one knob
# set to 0; the SGLANG_OPT_* feature toggles are all enabled (=1) via a loop.
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
for _opt in \
  SWA_SPLIT_LEAF_ON_INSERT \
  USE_JIT_NORM \
  USE_JIT_INDEXER_METADATA \
  USE_TOPK_V2 \
  USE_CUSTOM_ALL_REDUCE_V2; do
  export "SGLANG_OPT_${_opt}=1"
done
unset _opt

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

# Echo the effective tuning inputs so failed runs are diagnosable from the log.
echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# EVAL_ONLY is optional (not in check_env_vars), so default it to "" to stay
# `set -u`-safe. EVAL_CONTEXT_ARGS is kept as a flat string -- the launch line
# expands it unquoted on purpose so an empty value contributes zero args.
EVAL_CONTEXT_ARGS=""
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
SPEC_FLAGS=(
--speculative-algorithm EAGLE
--speculative-num-steps 3
--speculative-eagle-topk 1
--speculative-num-draft-tokens 4
)

if [ "${DP_ATTENTION}" = "true" ]; then
# Large-batch EP path: deepep + mega_moe.
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--deepep-config "$DEEPEP_CONFIG"
)
CHUNKED_PREFILL_SIZE=32768
else
# Small-batch TP-only path: flashinfer_mxfp4.
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
)
CHUNKED_PREFILL_SIZE=8192
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
# launch config is auditable from the result artifact alone.
{
  echo "=== SGLANG_* env vars at launch ==="
  env | grep -E '^SGLANG_' | sort
  echo "==================================="
} | tee "$SERVER_LOG"

set -x
# --max-running-requests: 1.5x the target concurrency, floored at 8, so the
# scheduler has headroom over the benchmark's --max-concurrency.
# EVAL_CONTEXT_ARGS is deliberately left unquoted: it is a flat string that
# must word-split into zero or more flags (empty when EVAL_ONLY != true).
# shellcheck disable=SC2086
PYTHONNOUSERSITE=1 sglang serve \
  --model-path "$MODEL" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --trust-remote-code \
  --tp "$TP" \
  --ep-size "$EP_SIZE" \
  --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
  --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
  --mem-fraction-static 0.90 \
  --swa-full-tokens-ratio 0.1 \
  "${SPEC_FLAGS[@]}" \
  "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# run_benchmark_serving needs these at runtime; install just before use.
pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir "$PWD/"

# RUN_EVAL is optional; ${VAR:-} keeps the test `set -u`-safe.
if [[ "${RUN_EVAL:-}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# NOTE(review): the sglang server ($SERVER_PID) is never killed here --
# presumably the CI runner tears down the container after the step. Confirm,
# or add `kill "$SERVER_PID"` before stop_gpu_monitor.
stop_gpu_monitor
set +x
12 changes: 12 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1875,3 +1875,15 @@
- "better performance for dp-attention"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174

- config-keys:
- dsv4-fp4-b300-sglang-mtp
description:
- "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
- "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
- "Model: deepseek-ai/DeepSeek-V4-Pro"
- "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
- "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
- "CONC bands: A=TP8 (1-8) is configured; bands B=TP4 (16-128) and C=DP4 dp-attn (64-512) are described but NOT present in nvidia-master.yaml search-spaces -- verify before relying on this entry"
- "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166