diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 05b19d802..39e299cb0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1893,9 +1893,9 @@ dsv4-fp4-b300-sglang: # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by # DP_ATTENTION: # dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 -# dp-attn: true -> DP-attn + deepep mega_moe + chunked-prefill 32768 -# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, -# while the TP-only path leaves ep_size at the default of 1. +# + EAGLE (3,1,4) + mem-fraction 0.90 +# dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 +# + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 dsv4-fp4-b300-sglang-mtp: image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro @@ -1904,20 +1904,21 @@ dsv4-fp4-b300-sglang-mtp: precision: fp4 framework: sglang multinode: false - # Three CONC bands sweep with EAGLE/MTP (3/1/4) on top: - # A: TP=8 ep=1 -- conc 1-8 (latency-bound, full TP) - # B: TP=4 ep=1 -- conc 16-128 (TP-only, mid batch) - # C: TP=4 ep=4 dp-attn -- conc 64-512 (DP-attn + EP, large batch) - # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head). 
+ # Three CONC bands: + # A: TP=8 ep=1 -- conc 1-8 EAGLE (3,1,4) TP-only fallback + # B: TP=4 ep=1 -- conc 4-32 EAGLE (3,1,4) TP-only mid batch + # C: TP=4 ep=1 dp-attn -- conc 16-256 EAGLE (1,1,2) DP-attn flashinfer seq-len-configs: - isl: 1024 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index d01f80a1d..03102778d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -6,12 +6,12 @@ source "$(dirname "$0")/../benchmark_lib.sh" # TP -- tensor parallel size -> --tp # EP_SIZE -- expert parallel size -> --ep-size # DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP -# Also selects MoE backend / chunked-prefill-size: -# true -> deepep + mega_moe + chunked-prefill 32768 -# false -> flashinfer_mxfp4 + chunked-prefill 8192 -# -# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3, -# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands. +# Also selects MoE backend / chunked-prefill / EAGLE chain +# / mem-fraction-static / max-running-requests: +# true -> flashinfer_mxfp4 + DP-attn + chunked-prefill 32768 +# + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 +# false -> flashinfer_mxfp4 (TP-only) + chunked-prefill 8192 +# + EAGLE (3,1,4) + mem-fraction 0.90 + max-running max(CONC*3/2, 8) check_env_vars \ MODEL \ TP \ @@ -63,40 +63,52 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
+# Recipe path is selected by DP_ATTENTION; MoE backend, chunked-prefill, EAGLE +# chain, mem-fraction, and max-running all follow. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe. -SPEC_FLAGS=( - --speculative-algorithm EAGLE - --speculative-num-steps 3 - --speculative-eagle-topk 1 - --speculative-num-draft-tokens 4 -) - if [ "${DP_ATTENTION}" = "true" ]; then - # Large-batch EP path: deepep + mega_moe. - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256). + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 export SGLANG_OPT_USE_FAST_MASK_EP=1 export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 1 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 2 + ) PARALLEL_ARGS=( --dp-size "$TP" --enable-dp-attention - --moe-a2a-backend deepep + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune --deepep-config "$DEEPEP_CONFIG" + --cuda-graph-max-bs 256 ) CHUNKED_PREFILL_SIZE=32768 + MEM_FRACTION_STATIC=0.92 + MAX_RUNNING_REQUESTS=256 else - # Small-batch TP-only path: flashinfer_mxfp4. + # TP-only fallback for low-conc: flashinfer_mxfp4 + EAGLE (3,1,4). + SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 3 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 4 + ) PARALLEL_ARGS=( --moe-runner-backend flashinfer_mxfp4 --disable-flashinfer-autotune ) CHUNKED_PREFILL_SIZE=8192 + MEM_FRACTION_STATIC=0.90 + MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? 
CONC * 3 / 2 : 8 ))" fi # Print all SGLANG_* env vars to both the CI step log and server.log so the @@ -116,8 +128,8 @@ PYTHONNOUSERSITE=1 sglang serve \ --tp $TP \ --ep-size $EP_SIZE \ --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ - --mem-fraction-static 0.90 \ + --max-running-requests "$MAX_RUNNING_REQUESTS" \ + --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio 0.1 \ "${SPEC_FLAGS[@]}" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 22b6743b5..a29c278f2 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1907,3 +1907,14 @@ - "ISL=8192: TP8 conc 4-32; DP8 (dp-attn) conc 64-1024" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1156 +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE/MTP chain selected in script by dp-attn: TP-only uses num-steps=3, eagle-topk=1, num-draft-tokens=4; DP-attn uses (1, 1, 2)" + - "Recipe (MoE backend, chunked-prefill, mem-fraction, max-running) selected in script by dp-attn: both paths use flashinfer_mxfp4 -- TP-only + chunked-prefill 8192 (small batch) vs DP-attn + chunked-prefill 32768 (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (4-32), C=TP4 dp-attn (16-256); B/C overlap at conc 16,32" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1180