diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 911cb503e..3a7ba3df1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1867,6 +1867,40 @@ dsv4-fp4-b300-sglang:
         - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
         - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
 
+# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. The recipe is
+# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
+# DP_ATTENTION:
+#   dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
+#   dp-attn: true  -> DP-attn + deepep mega_moe + chunked-prefill 32768
+# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+# while the TP-only path leaves ep_size at the default of 1.
+dsv4-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  # Three CONC bands, all swept with EAGLE/MTP (3/1/4) on top:
+  #   A: TP=8 ep=1         -- conc 1-8    (latency-bound, full TP)
+  #   B: TP=4 ep=1         -- conc 16-128 (TP-only, mid batch)
+  #   C: TP=4 ep=4 dp-attn -- conc 64-512 (DP-attn + EP, large batch)
+  # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head).
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
+        - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp }
+        - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp }
+
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
new file mode 100755
index 000000000..767b9a8f9
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+# Tuning inputs from the matrix (all required):
+#   TP           -- tensor parallel size -> --tp
+#   EP_SIZE      -- expert parallel size -> --ep-size
+#   DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP
+#                   Also selects MoE backend / chunked-prefill-size:
+#                     true  -> deepep + mega_moe + chunked-prefill 32768
+#                     false -> flashinfer_mxfp4 + chunked-prefill 8192
+#
+# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
+# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
+check_env_vars \
+  MODEL \
+  TP \
+  EP_SIZE \
+  DP_ATTENTION \
+  CONC \
+  ISL \
+  OSL \
+  RANDOM_RANGE_RATIO \
+  RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+  hf download "$MODEL"
+fi
+
+nvidia-smi
+
+# Common SGLANG env vars (apply to every config).
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
+export SGLANG_OPT_USE_JIT_NORM=1
+export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
+export SGLANG_OPT_USE_TOPK_V2=1
+export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
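+
+# Example standalone invocation (hypothetical values; in CI these are injected
+# from the matrix entry in nvidia-master.yaml):
+#   MODEL=deepseek-ai/DeepSeek-V4-Pro TP=4 EP_SIZE=4 DP_ATTENTION=true \
+#   CONC=512 ISL=1024 OSL=1024 RANDOM_RANGE_RATIO=1.0 \
+#   RESULT_FILENAME=dsv4_mtp_c512.json ./dsv4_fp4_b300_sglang_mtp.sh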
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+  setup_eval_context
+  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
+SPEC_FLAGS=(
+  --speculative-algorithm EAGLE
+  --speculative-num-steps 3
+  --speculative-eagle-topk 1
+  --speculative-num-draft-tokens 4
+)
+
+if [ "${DP_ATTENTION}" = "true" ]; then
+  # Large-batch EP path: deepep + mega_moe.
+  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
+  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
+  export SGLANG_OPT_USE_FAST_MASK_EP=1
+  export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
+  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
+  export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
+  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
+  PARALLEL_ARGS=(
+    --dp-size "$TP"
+    --enable-dp-attention
+    --moe-a2a-backend deepep
+    --deepep-config "$DEEPEP_CONFIG"
+  )
+  CHUNKED_PREFILL_SIZE=32768
+else
+  # Small-batch TP-only path: flashinfer_mxfp4.
+  PARALLEL_ARGS=(
+    --moe-runner-backend flashinfer_mxfp4
+    --disable-flashinfer-autotune
+  )
+  CHUNKED_PREFILL_SIZE=8192
+fi
+
+# Print all SGLANG_* env vars to both the CI step log and server.log so the
+# launch config is auditable from the result artifact alone.
+{
+  echo "=== SGLANG_* env vars at launch ==="
+  env | grep -E '^SGLANG_' | sort
+  echo "==================================="
+} | tee "$SERVER_LOG"
+
+set -x
+# max-running-requests gives the scheduler ~1.5x CONC headroom (integer math),
+# floored at 8: CONC=1 -> 8, CONC=8 -> 12, CONC=64 -> 96, CONC=512 -> 768.
+PYTHONNOUSERSITE=1 sglang serve \
+  --model-path "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --trust-remote-code \
+  --tp "$TP" \
+  --ep-size "$EP_SIZE" \
+  --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
+  --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
+  --mem-fraction-static 0.90 \
+  --swa-full-tokens-ratio 0.1 \
+  "${SPEC_FLAGS[@]}" \
+  "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
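+
+# $! is the PID of the backgrounded `sglang serve`; wait_for_server_ready
+# (from benchmark_lib.sh) takes it alongside the port/log, presumably so it
+# can fail fast if the server process dies during the readiness poll.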
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts $((CONC * 10)) \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir "$PWD/"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT"
+  append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 589f75766..4c85924b4 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1875,3 +1875,15 @@
     - "better performance for dp-attention"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
+    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
+    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=TP4 EP4 dp-attn (64-512); B/C overlap at conc 64,128"
+    - "Configs: 1k1k and 8k1k; no validation.py / launcher / yaml-field changes (knob-free)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166