From 148223d4af01567af9ff7af893fcdba8cc1d6f14 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 20:50:05 -0700 Subject: [PATCH 01/17] sglang dsv4 mtp --- .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 170 ++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100755 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh new file mode 100755 index 000000000..4383c408f --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -0,0 +1,170 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip +# `hf download`. Only fetch when MODEL looks like a HF repo ID. +if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi + +nvidia-smi + +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +# TODO(Cam): the deepseek-v4 sglang images install sglang editable at +# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. +# The runner mounts our repo at a non-/workspace path for these images so the +# editable install stays visible. Paths in this script are $PWD-relative for +# that reason. Drop the runner conditional once lmsys moves sglang back out of +# /workspace. + +SERVER_LOG="$PWD/server.log" +PORT=${PORT:-8888} + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# with EAGLE / MTP enabled: +# - low-latency (CONC <= 32): TP-only, flashinfer_mxfp4 MoE +# - balanced (32 < CONC <= 128): + DP-attn, mega-moe EP +# - max-throughput (CONC > 128): + DP-attn, mega-moe EP, max-running-requests=512 +# Speculative-decoding flags follow the cookbook EAGLE config. +DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + +# MTP (EAGLE) speculative-decoding flags applied to every recipe. +SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 3 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 4 +) + +if [[ $CONC -le 32 ]]; then + RECIPE=low-latency + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 32768 + --disable-flashinfer-autotune + --mem-fraction-static 0.90 + --max-running-requests 32 + --swa-full-tokens-ratio 0.1 + ) +elif [[ $CONC -le 128 ]]; then + RECIPE=balanced + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.83 + --max-running-requests 128 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 + ) +else + RECIPE=max-throughput + export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 + # common optimizations + export SGLANG_OPT_USE_JIT_NORM=1 + export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 + export SGLANG_OPT_USE_TOPK_V2=1 + export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 + # MoE EP related flags + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.90 + --max-running-requests 512 + --chunked-prefill-size 32768 + --swa-full-tokens-ratio 0.1 + ) +fi +echo "Recipe: $RECIPE (CONC=$CONC)" + +set -x +PYTHONNOUSERSITE=1 sglang serve \ + --model-path $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --trust-remote-code \ + --tp $TP \ + "${SPEC_FLAGS[@]}" \ + "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $((CONC * 10)) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir "$PWD/" \ + --use-chat-template + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x From c883e8dd66585f4ac6c8ef21357ef2e610fbcc37 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 21:02:14 -0700 Subject: [PATCH 02/17] knob-driven recipe selection --- .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 144 ++++++++---------- 1 file changed, 67 insertions(+), 77 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 4383c408f..0ac9d017d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -2,9 +2,25 @@ source "$(dirname "$0")/../benchmark_lib.sh" +# Tuning knobs (matrix-driven, all required - no script-side defaults): +# TP -- tensor parallel size -> --tp +# EP_SIZE -- expert parallel size -> --ep-size +# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP +# MOE_RUNNER_BACKEND -- recipe label, one of: deepep | flashinfer_mxfp4 +# deepep -> --moe-a2a-backend deepep + mega_moe env vars +# flashinfer_mxfp4 -> --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune +# CHUNKED_PREFILL_SIZE -- --chunked-prefill-size value (e.g. 8192, 32768) +# +# MTP/EAGLE speculative-decoding flags are applied unconditionally on top of +# every recipe (same draft chain across CONC ranges). Tuning the spec config +# per recipe is left as future work once we have sweep data. check_env_vars \ MODEL \ TP \ + EP_SIZE \ + DP_ATTENTION \ + MOE_RUNNER_BACKEND \ + CHUNKED_PREFILL_SIZE \ CONC \ ISL \ OSL \ @@ -23,7 +39,13 @@ fi nvidia-smi +# Common SGLANG env vars (apply to every config). export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 +export SGLANG_OPT_USE_JIT_NORM=1 +export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 +export SGLANG_OPT_USE_TOPK_V2=1 +export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. @@ -35,7 +57,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, MOE_RUNNER_BACKEND: $MOE_RUNNER_BACKEND, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -45,12 +67,7 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# with EAGLE / MTP enabled: -# - low-latency (CONC <= 32): TP-only, flashinfer_mxfp4 MoE -# - balanced (32 < CONC <= 128): + DP-attn, mega-moe EP -# - max-throughput (CONC > 128): + DP-attn, mega-moe EP, max-running-requests=512 -# Speculative-decoding flags follow the cookbook EAGLE config. +# Recipe path is selected by MOE_RUNNER_BACKEND. DP-attention applies orthogonally below. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' # MTP (EAGLE) speculative-decoding flags applied to every recipe. @@ -61,76 +78,44 @@ SPEC_FLAGS=( --speculative-num-draft-tokens 4 ) -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 32768 - --disable-flashinfer-autotune - --mem-fraction-static 0.90 - --max-running-requests 32 - --swa-full-tokens-ratio 0.1 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.83 - --max-running-requests 128 - --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 - ) -else - RECIPE=max-throughput - export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 - # common optimizations - export SGLANG_OPT_USE_JIT_NORM=1 - export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 - export SGLANG_OPT_USE_TOPK_V2=1 - export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 - # MoE EP related flags - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.90 - --max-running-requests 512 - --chunked-prefill-size 32768 - --swa-full-tokens-ratio 0.1 - ) +case "${MOE_RUNNER_BACKEND}" in + deepep) + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + ) + ;; + flashinfer_mxfp4) + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + ) + ;; + *) + echo "ERROR: unknown MOE_RUNNER_BACKEND='${MOE_RUNNER_BACKEND}' (expected: deepep | flashinfer_mxfp4)" >&2 + exit 1 + ;; +esac + +# DP-attention is orthogonal to MOE_RUNNER_BACKEND. +if [ "${DP_ATTENTION}" = "true" ]; then + PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention) fi -echo "Recipe: $RECIPE (CONC=$CONC)" + +# Print all SGLANG_* env vars to both the CI step log and server.log so the +# launch config is auditable from the result artifact alone. +{ + echo "=== SGLANG_* env vars at launch ===" + env | grep -E '^SGLANG_' | sort + echo "===================================" +} | tee "$SERVER_LOG" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -139,8 +124,13 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ + --ep-size $EP_SIZE \ + --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ + --max-running-requests "$((CONC * 3 / 2))" \ + --mem-fraction-static 0.90 \ + --swa-full-tokens-ratio 0.1 \ "${SPEC_FLAGS[@]}" \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & SERVER_PID=$! From 3a49ed12d0aa44c79141d8ba573390230f7275b2 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 21:09:48 -0700 Subject: [PATCH 03/17] self-contained mtp config; recipe via dp-attn --- .github/configs/nvidia-master.yaml | 29 ++++++++ .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 69 ++++++++----------- perf-changelog.yaml | 11 +++ 3 files changed, 70 insertions(+), 39 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 42c720a63..0351ab754 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1873,6 +1873,35 @@ dsv4-fp4-b300-sglang: # max-throughput - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } +# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is +# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by +# DP_ATTENTION: +# dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 +# dp-attn: true -> DP-attn + deepep mega_moe + chunked-prefill 32768 +# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, +# while the TP-only path leaves ep_size at the default of 1. +dsv4-fp4-b300-sglang-mtp: + image: lmsysorg/sglang:deepseek-v4-b300 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e model: Qwen/Qwen3.5-397B-A17B diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 0ac9d017d..deac9b9ca 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -3,13 +3,12 @@ source "$(dirname "$0")/../benchmark_lib.sh" # Tuning knobs (matrix-driven, all required - no script-side defaults): -# TP -- tensor parallel size -> --tp -# EP_SIZE -- expert parallel size -> --ep-size -# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP -# MOE_RUNNER_BACKEND -- recipe label, one of: deepep | flashinfer_mxfp4 -# deepep -> --moe-a2a-backend deepep + mega_moe env vars -# flashinfer_mxfp4 -> --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune -# CHUNKED_PREFILL_SIZE -- --chunked-prefill-size value (e.g. 8192, 32768) +# TP -- tensor parallel size -> --tp +# EP_SIZE -- expert parallel size -> --ep-size +# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP +# Also selects MoE backend / chunked-prefill-size: +# true -> deepep + mega_moe + chunked-prefill 32768 +# false -> flashinfer_mxfp4 + chunked-prefill 8192 # # MTP/EAGLE speculative-decoding flags are applied unconditionally on top of # every recipe (same draft chain across CONC ranges). Tuning the spec config @@ -19,8 +18,6 @@ check_env_vars \ TP \ EP_SIZE \ DP_ATTENTION \ - MOE_RUNNER_BACKEND \ - CHUNKED_PREFILL_SIZE \ CONC \ ISL \ OSL \ @@ -57,7 +54,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, MOE_RUNNER_BACKEND: $MOE_RUNNER_BACKEND, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -67,7 +64,7 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Recipe path is selected by MOE_RUNNER_BACKEND. DP-attention applies orthogonally below. +# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' # MTP (EAGLE) speculative-decoding flags applied to every recipe. @@ -78,35 +75,29 @@ SPEC_FLAGS=( --speculative-num-draft-tokens 4 ) -case "${MOE_RUNNER_BACKEND}" in - deepep) - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - ) - ;; - flashinfer_mxfp4) - PARALLEL_ARGS=( - --moe-runner-backend flashinfer_mxfp4 - --disable-flashinfer-autotune - ) - ;; - *) - echo "ERROR: unknown MOE_RUNNER_BACKEND='${MOE_RUNNER_BACKEND}' (expected: deepep | flashinfer_mxfp4)" >&2 - exit 1 - ;; -esac - -# DP-attention is orthogonal to MOE_RUNNER_BACKEND. if [ "${DP_ATTENTION}" = "true" ]; then - PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention) + # Large-batch EP path: deepep + mega_moe. + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + ) + CHUNKED_PREFILL_SIZE=32768 +else + # Small-batch TP-only path: flashinfer_mxfp4. + PARALLEL_ARGS=( + --moe-runner-backend flashinfer_mxfp4 + --disable-flashinfer-autotune + ) + CHUNKED_PREFILL_SIZE=8192 fi # Print all SGLANG_* env vars to both the CI step log and server.log so the diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ed3c16ff..12278037e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1833,3 +1833,14 @@ - "Bump --chunked-prefill-size from 4096 to 8192" - "Retrigger dsv4-fp8-mi355x-sglang" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1160 + +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" + - "Image: lmsysorg/sglang:deepseek-v4-b300" + - "Model: deepseek-ai/DeepSeek-V4-Pro" + - "EAGLE flags: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 From 6f1b80a639f81e332005ff1c10f04f825b35bf04 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 21:20:24 -0700 Subject: [PATCH 04/17] add mtp_1 (1/1/2) variant --- .github/configs/nvidia-master.yaml | 13 +++++ .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 52 ++++++++++++------- perf-changelog.yaml | 2 +- runners/launch_b300-nv.sh | 2 +- utils/matrix_logic/validation.py | 8 +-- 5 files changed, 53 insertions(+), 24 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0351ab754..c0d118895 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1888,19 +1888,32 @@ dsv4-fp4-b300-sglang-mtp: precision: fp4 framework: sglang multinode: false + # Two EAGLE chain lengths sweep side-by-side per (tp, ep, dp-attn) combo: + # mtp -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default chain) + # mtp_1 -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step) seq-len-configs: - isl: 1024 osl: 1024 search-space: + # mtp (3/1/4) - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + # mtp_1 (1/1/2) + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 } - isl: 8192 osl: 1024 search-space: + # mtp (3/1/4) - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } + # mtp_1 (1/1/2) + - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 } + - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index deac9b9ca..56a9d6899 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -3,21 +3,21 @@ source "$(dirname "$0")/../benchmark_lib.sh" # Tuning knobs (matrix-driven, all required - no script-side defaults): -# TP -- tensor parallel size -> --tp -# EP_SIZE -- expert parallel size -> --ep-size -# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP -# Also selects MoE backend / chunked-prefill-size: -# true -> deepep + mega_moe + chunked-prefill 32768 -# false -> flashinfer_mxfp4 + chunked-prefill 8192 -# -# MTP/EAGLE speculative-decoding flags are applied unconditionally on top of -# every recipe (same draft chain across CONC ranges). Tuning the spec config -# per recipe is left as future work once we have sweep data. +# TP -- tensor parallel size -> --tp +# EP_SIZE -- expert parallel size -> --ep-size +# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP +# Also selects MoE backend / chunked-prefill-size: +# true -> deepep + mega_moe + chunked-prefill 32768 +# false -> flashinfer_mxfp4 + chunked-prefill 8192 +# SPEC_DECODING -- selects EAGLE chain length: +# mtp -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default) +# mtp_1 -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step) check_env_vars \ MODEL \ TP \ EP_SIZE \ DP_ATTENTION \ + SPEC_DECODING \ CONC \ ISL \ OSL \ @@ -54,7 +54,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, SPEC_DECODING: $SPEC_DECODING, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -67,13 +67,29 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" # Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# MTP (EAGLE) speculative-decoding flags applied to every recipe. -SPEC_FLAGS=( - --speculative-algorithm EAGLE - --speculative-num-steps 3 - --speculative-eagle-topk 1 - --speculative-num-draft-tokens 4 -) +# MTP (EAGLE) speculative-decoding flags. Chain length selected by SPEC_DECODING. +case "${SPEC_DECODING}" in + mtp_1) + SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 1 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 2 + ) + ;; + mtp) + SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 3 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 4 + ) + ;; + *) + echo "ERROR: unsupported SPEC_DECODING='${SPEC_DECODING}' (expected: mtp | mtp_1)" >&2 + exit 1 + ;; +esac if [ "${DP_ATTENTION}" = "true" ]; then # Large-batch EP path: deepep + mega_moe. diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 12278037e..f5970f126 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1840,7 +1840,7 @@ - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - "Image: lmsysorg/sglang:deepseek-v4-b300" - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "EAGLE flags: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Two EAGLE chain lengths swept side-by-side per (tp, ep, dp-attn) combo: mtp=3/1/4 (default) and mtp_1=1/1/2 (single-step)" - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 3c855e805..27760df4b 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -259,7 +259,7 @@ else export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == mtp* ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models # with multiple inference engines can coexist; fall back to the historical # name without an engine suffix (`_trt` for trt, bare for everyone else) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index ce10840b5..9210e0b07 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -77,7 +77,7 @@ class SingleNodeMatrixEntry(BaseModel): model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str - spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -116,7 +116,7 @@ class MultiNodeMatrixEntry(BaseModel): model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str - spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -204,7 +204,7 @@ class SingleNodeSearchSpaceEntry(BaseModel): tp: int ep: Optional[int] = None - spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( default="none", alias=Fields.SPEC_DECODING.value) dp_attn: Optional[bool] = Field( default=None, alias=Fields.DP_ATTN.value) @@ -224,7 +224,7 @@ class MultiNodeSearchSpaceEntry(BaseModel): """Multinode search space configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) - spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( default="none", alias=Fields.SPEC_DECODING.value) prefill: WorkerConfig decode: WorkerConfig From 1b34a8d0d3068914a5965aafa7a2225028c012d6 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 21:34:15 -0700 Subject: [PATCH 05/17] knob-driven recipe selection --- .github/configs/nvidia-master.yaml | 33 +++++++++++++----------------- perf-changelog.yaml | 8 +++++--- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c0d118895..0b09dc048 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1888,32 +1888,27 @@ dsv4-fp4-b300-sglang-mtp: precision: fp4 framework: sglang multinode: false - # Two EAGLE chain lengths sweep side-by-side per (tp, ep, dp-attn) combo: - # mtp -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default chain) - # mtp_1 -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step) + # Four configs dispatched by CONC, with overlap at transitions: + # A: TP=8 ep=1, mtp (3/1/4) -- conc 1-8 (latency-bound) + # B: TP=4 ep=1, mtp (3/1/4) -- conc 16-128 (TP-only mid batch) + # C: TP=4 ep=4 dp-attn, mtp (3/1/4) -- conc 64-256 (DP-attn + EP) + # D: TP=4 ep=4 dp-attn, mtp_1 (1/1/2) -- conc 256-512 (short spec at large batch) + # Overlaps: B/C at conc 64,128 (TP-only vs DP-attn EP); C/D at 256 (3/1/4 vs 1/1/2). seq-len-configs: - isl: 1024 osl: 1024 search-space: - # mtp (3/1/4) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } - # mtp_1 (1/1/2) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 } - isl: 8192 osl: 1024 search-space: - # mtp (3/1/4) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp } - # mtp_1 (1/1/2) - - { tp: 8, ep: 1, conc-start: 1, conc-end: 32, spec-decoding: mtp_1 } - - { tp: 4, ep: 1, conc-start: 1, conc-end: 64, spec-decoding: mtp_1 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 32, conc-end: 512, spec-decoding: mtp_1 } + - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } + - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f5970f126..e82104fdc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1840,7 +1840,9 @@ - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - "Image: lmsysorg/sglang:deepseek-v4-b300" - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Two EAGLE chain lengths swept side-by-side per (tp, ep, dp-attn) combo: mtp=3/1/4 (default) and mtp_1=1/1/2 (single-step)" - - "Recipe selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - - "Configs: 1k1k and 8k1k, tp 4/8 with conc 1-512" + - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)" + - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn" + - "EAGLE chain selected by spec-decoding: mtp=3/1/4 vs mtp_1=1/1/2" + - "Configs: 1k1k and 8k1k, total 26 sweep entries" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 From 481482ac44cbe021795176f21c462606419ea250 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 22:03:14 -0700 Subject: [PATCH 06/17] pin sglang image to mega_moe-capable digest --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0b09dc048..35a36b728 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1881,7 +1881,7 @@ dsv4-fp4-b300-sglang: # `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while the TP-only path leaves ep_size at the default of 1. dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e82104fdc..7541a81ec 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1838,7 +1838,7 @@ - dsv4-fp4-b300-sglang-mtp description: - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 (pinned for deep_gemm transform_weights_for_mega_moe support)" - "Model: deepseek-ai/DeepSeek-V4-Pro" - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)" - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256" From 47fefec19307f8d77558701d3e312365a2ad2c4c Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Sat, 25 Apr 2026 22:18:47 -0700 Subject: [PATCH 07/17] drop mtp_1 knob; align with PR #1158 image digest --- .github/configs/nvidia-master.yaml | 19 +++---- .../single_node/dsv4_fp4_b300_sglang_mtp.sh | 53 +++++++------------ perf-changelog.yaml | 11 ++-- runners/launch_b300-nv.sh | 2 +- utils/matrix_logic/validation.py | 8 +-- 5 files changed, 36 insertions(+), 57 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 35a36b728..ee3d4dc9e 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1881,34 +1881,31 @@ dsv4-fp4-b300-sglang: # `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while the TP-only path leaves ep_size at the default of 1. dsv4-fp4-b300-sglang-mtp: - image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 + image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 framework: sglang multinode: false - # Four configs dispatched by CONC, with overlap at transitions: - # A: TP=8 ep=1, mtp (3/1/4) -- conc 1-8 (latency-bound) - # B: TP=4 ep=1, mtp (3/1/4) -- conc 16-128 (TP-only mid batch) - # C: TP=4 ep=4 dp-attn, mtp (3/1/4) -- conc 64-256 (DP-attn + EP) - # D: TP=4 ep=4 dp-attn, mtp_1 (1/1/2) -- conc 256-512 (short spec at large batch) - # Overlaps: B/C at conc 64,128 (TP-only vs DP-attn EP); C/D at 256 (3/1/4 vs 1/1/2). + # Three CONC bands sweep with EAGLE/MTP (3/1/4) on top: + # A: TP=8 ep=1 -- conc 1-8 (latency-bound, full TP) + # B: TP=4 ep=1 -- conc 16-128 (TP-only, mid batch) + # C: TP=4 ep=4 dp-attn -- conc 64-512 (DP-attn + EP, large batch) + # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head). seq-len-configs: - isl: 1024 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp_1 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 56a9d6899..7f012c5b2 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -2,22 +2,21 @@ source "$(dirname "$0")/../benchmark_lib.sh" -# Tuning knobs (matrix-driven, all required - no script-side defaults): -# TP -- tensor parallel size -> --tp -# EP_SIZE -- expert parallel size -> --ep-size -# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP -# Also selects MoE backend / chunked-prefill-size: -# true -> deepep + mega_moe + chunked-prefill 32768 -# false -> flashinfer_mxfp4 + chunked-prefill 8192 -# SPEC_DECODING -- selects EAGLE chain length: -# mtp -> num-steps=3, eagle-topk=1, num-draft-tokens=4 (default) -# mtp_1 -> num-steps=1, eagle-topk=1, num-draft-tokens=2 (single-step) +# Tuning inputs from the matrix (all required): +# TP -- tensor parallel size -> --tp +# EP_SIZE -- expert parallel size -> --ep-size +# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP +# Also selects MoE backend / chunked-prefill-size: +# true -> deepep + mega_moe + chunked-prefill 32768 +# false -> flashinfer_mxfp4 + chunked-prefill 8192 +# +# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3, +# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands. check_env_vars \ MODEL \ TP \ EP_SIZE \ DP_ATTENTION \ - SPEC_DECODING \ CONC \ ISL \ OSL \ @@ -54,7 +53,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, SPEC_DECODING: $SPEC_DECODING, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -67,29 +66,13 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" # Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# MTP (EAGLE) speculative-decoding flags. Chain length selected by SPEC_DECODING. -case "${SPEC_DECODING}" in - mtp_1) - SPEC_FLAGS=( - --speculative-algorithm EAGLE - --speculative-num-steps 1 - --speculative-eagle-topk 1 - --speculative-num-draft-tokens 2 - ) - ;; - mtp) - SPEC_FLAGS=( - --speculative-algorithm EAGLE - --speculative-num-steps 3 - --speculative-eagle-topk 1 - --speculative-num-draft-tokens 4 - ) - ;; - *) - echo "ERROR: unsupported SPEC_DECODING='${SPEC_DECODING}' (expected: mtp | mtp_1)" >&2 - exit 1 - ;; -esac +# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe. +SPEC_FLAGS=( + --speculative-algorithm EAGLE + --speculative-num-steps 3 + --speculative-eagle-topk 1 + --speculative-num-draft-tokens 4 +) if [ "${DP_ATTENTION}" = "true" ]; then # Large-batch EP path: deepep + mega_moe. diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7541a81ec..7dfa95310 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1838,11 +1838,10 @@ - dsv4-fp4-b300-sglang-mtp description: - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" - - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:d44a693204aea7995349a76d400190fbeb1662379fe874e81d151bdbe85e2234 (pinned for deep_gemm transform_weights_for_mega_moe support)" + - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)" - "Model: deepseek-ai/DeepSeek-V4-Pro" - - "Four configs dispatched by CONC: A=TP8/mtp (1-8), B=TP4/mtp (16-128), C=DP4/mtp (64-256), D=DP4/mtp_1 (256-512)" - - "Overlaps for head-to-head comparison: B/C at conc 64,128; C/D at conc 256" - - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn" - - "EAGLE chain selected by spec-decoding: mtp=3/1/4 vs mtp_1=1/1/2" - - "Configs: 1k1k and 8k1k, total 26 sweep entries" + - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4" + - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" + - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" + - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 27760df4b..3c855e805 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -259,7 +259,7 @@ else export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - SPEC_SUFFIX=$([[ "$SPEC_DECODING" == mtp* ]] && printf '_mtp' || printf '') + SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models # with multiple inference engines can coexist; fall back to the historical # name without an engine suffix (`_trt` for trt, bare for everyone else) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 9210e0b07..ce10840b5 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -77,7 +77,7 @@ class SingleNodeMatrixEntry(BaseModel): model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str - spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -116,7 +116,7 @@ class MultiNodeMatrixEntry(BaseModel): model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str - spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -204,7 +204,7 @@ class SingleNodeSearchSpaceEntry(BaseModel): tp: int ep: Optional[int] = None - spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( default="none", alias=Fields.SPEC_DECODING.value) dp_attn: Optional[bool] = Field( default=None, alias=Fields.DP_ATTN.value) @@ -224,7 +224,7 @@ class MultiNodeSearchSpaceEntry(BaseModel): """Multinode search space configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) - spec_decoding: Literal["mtp", "mtp_1", "draft_model", "none"] = Field( + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( default="none", alias=Fields.SPEC_DECODING.value) prefill: WorkerConfig decode: WorkerConfig From 287ef26124bb16b71a61b11e47ad858afebe385c Mon Sep 17 00:00:00 2001 From: Yuhao Yang <47235274+yhyang201@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:37:49 +0800 Subject: [PATCH 08/17] update nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ee3d4dc9e..6acb8967b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1898,14 +1898,10 @@ dsv4-fp4-b300-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 16, conc-end: 128, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 512, spec-decoding: mtp } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e From f64505b9ed22dc0f603570530cbc7ad70aac0b6c Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 17:50:18 +0800 Subject: [PATCH 09/17] fix: restore trailing newline in perf-changelog.yaml --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7ec0bf8aa..5ac45fdff 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1873,4 +1873,4 @@ - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)" - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128" - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 \ No newline at end of file + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166 From 4f468d68012d7c611107f2a8279716be63df7af1 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 20:09:48 +0800 Subject: [PATCH 10/17] fix: remove --use-chat-template and floor --max-running-requests at 8 The tokenizer for DSv4-Pro has no chat_template set, so --use-chat-template causes benchmark_serving.py to crash with ValueError. Remove it to align with dsv4_fp4_b300_sglang.sh. Also add a floor of 8 to --max-running-requests to match the base script and avoid too-low values at low concurrency. --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 7f012c5b2..767b9a8f9 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -116,7 +116,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --tp $TP \ --ep-size $EP_SIZE \ --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ - --max-running-requests "$((CONC * 3 / 2))" \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ --mem-fraction-static 0.90 \ --swa-full-tokens-ratio 0.1 \ "${SPEC_FLAGS[@]}" \ @@ -138,8 +138,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir "$PWD/" \ - --use-chat-template + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" From fc93e84bda55448f4d30006b15eb7e99b6f0bbb1 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 20:13:11 +0800 Subject: [PATCH 11/17] perf-changelog: add dsv4-fp4-b300-sglang-mtp entry Rebase perf-changelog.yaml on latest main (preserving #1173 and #1174 entries) and append the MTP config entry for PR #1166. --- perf-changelog.yaml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5ac45fdff..4c85924b4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1862,8 +1862,21 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - -- config-keys: + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "better performance for dp-attention" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174 + +- config-keys: - dsv4-fp4-b300-sglang-mtp description: - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding" From cea70e55a839265e04009155e85a60ef5fe45e99 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:02:24 +0800 Subject: [PATCH 12/17] dsv4-b300-sglang: add conc=2048 8k1k recipe with finite request-rate Add an ultra-high-concurrency DP-attention recipe (TP=8, deepep mega_moe, chunked-prefill 65536, request-rate 16) for the 8k1k workload at conc=2048. To support finite request-rate, make benchmark_lib.sh's run_benchmark_serving() accept an optional --request-rate parameter (defaults to inf so all existing callers are unaffected). Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 1 + benchmarks/benchmark_lib.sh | 7 +++- .../single_node/dsv4_fp4_b300_sglang.sh | 32 +++++++++++++++++-- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3a7ba3df1..7b31bfe29 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1866,6 +1866,7 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 268745735..9845ee38c 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -205,6 +205,7 @@ run_benchmark_serving() { local use_chat_template=false local dsv4=false local trust_remote_code=false + local request_rate="inf" local server_pid="" while [[ $# -gt 0 ]]; do @@ -266,6 +267,10 @@ run_benchmark_serving() { trust_remote_code=true shift ;; + --request-rate) + request_rate="$2" + shift 2 + ;; --server-pid) server_pid="$2" shift 2 @@ -347,7 +352,7 @@ run_benchmark_serving() { --random-range-ratio "$random_range_ratio" --num-prompts "$num_prompts" --max-concurrency "$max_concurrency" - --request-rate inf + --request-rate "$request_rate" --ignore-eos "${profile_flag[@]}" --save-result diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index ac552c733..0d4940918 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -66,10 +66,35 @@ fi # single-instance uses flashinfer_mxfp4 with the cookbook defaults. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# Default; the DP-attn branch below overrides to 0.94. +# Default; the DP-attn branches below override per recipe. MEM_FRACTION_STATIC=0.90 +MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" +REQUEST_RATE="inf" -if [ "${DP_ATTENTION}" = "true" ]; then +if [ "${DP_ATTENTION}" = "true" ] && [ "$CONC" -ge 2048 ]; then + # Ultra-high-concurrency DP-attention recipe: TP=8, deepep mega_moe backend. + export SGLANG_LOG_FORWARD_ITERS=1 + export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 + export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 + export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 + export SGLANG_OPT_USE_FAST_MASK_EP=1 + export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 + export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 + export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 + PARALLEL_ARGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --cuda-graph-max-bs 288 + --deepep-config "$DEEPEP_CONFIG" + --chunked-prefill-size 65536 + --enable-prefill-delayer + ) + MEM_FRACTION_STATIC=0.87 + MAX_RUNNING_REQUESTS=2560 + REQUEST_RATE=16 +elif [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 @@ -111,7 +136,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --max-running-requests "$MAX_RUNNING_REQUESTS" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & @@ -131,6 +156,7 @@ run_benchmark_serving \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ + --request-rate "$REQUEST_RATE" \ --result-filename "$RESULT_FILENAME" \ --result-dir "$PWD/" From 97a7e7d780b5a27023ae7db1821dda5a7fadf7dc Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:05:12 +0800 Subject: [PATCH 13/17] dsv4-b300-sglang: temporarily keep only conc=2048 8k1k for experiment Remove 1k1k and other 8k1k search-space entries so CI only runs the new conc=2048 recipe. Original configs noted in comments for restore. Co-Authored-By: Claude Opus 4.6 --- .github/configs/nvidia-master.yaml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7b31bfe29..332cb23d1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1854,18 +1854,13 @@ dsv4-fp4-b300-sglang: # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } + # NOTE: 1k1k and other 8k1k configs temporarily removed for conc=2048 experiment. + # Restore after experiment: + # 1k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512 + # 8k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512 - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is From 628e47b1f796cbb8bb8e8e03c2c476d299d3bc85 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:10:38 +0800 Subject: [PATCH 14/17] Revert "dsv4-b300-sglang: temporarily keep only conc=2048 8k1k for experiment" This reverts commit 97a7e7d780b5a27023ae7db1821dda5a7fadf7dc. --- .github/configs/nvidia-master.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 332cb23d1..7b31bfe29 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1854,13 +1854,18 @@ dsv4-fp4-b300-sglang: # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, # while low-latency leaves ep_size at the default of 1. seq-len-configs: - # NOTE: 1k1k and other 8k1k configs temporarily removed for conc=2048 experiment. - # Restore after experiment: - # 1k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512 - # 8k1k: tp8/ep1/conc1, tp4/ep1/conc32, tp4/ep4/dpa/conc512 + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - isl: 8192 osl: 1024 search-space: + - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is From 1526e9d8b04b9969ccb18ff12dd7c62d0127b320 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:11:11 +0800 Subject: [PATCH 15/17] Revert "dsv4-b300-sglang: add conc=2048 8k1k recipe with finite request-rate" This reverts commit cea70e55a839265e04009155e85a60ef5fe45e99. --- .github/configs/nvidia-master.yaml | 1 - benchmarks/benchmark_lib.sh | 7 +--- .../single_node/dsv4_fp4_b300_sglang.sh | 32 ++----------------- 3 files changed, 4 insertions(+), 36 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7b31bfe29..3a7ba3df1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1866,7 +1866,6 @@ dsv4-fp4-b300-sglang: - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 9845ee38c..268745735 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -205,7 +205,6 @@ run_benchmark_serving() { local use_chat_template=false local dsv4=false local trust_remote_code=false - local request_rate="inf" local server_pid="" while [[ $# -gt 0 ]]; do @@ -267,10 +266,6 @@ run_benchmark_serving() { trust_remote_code=true shift ;; - --request-rate) - request_rate="$2" - shift 2 - ;; --server-pid) server_pid="$2" shift 2 @@ -352,7 +347,7 @@ run_benchmark_serving() { --random-range-ratio "$random_range_ratio" --num-prompts "$num_prompts" --max-concurrency "$max_concurrency" - --request-rate "$request_rate" + --request-rate inf --ignore-eos "${profile_flag[@]}" --save-result diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 0d4940918..ac552c733 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -66,35 +66,10 @@ fi # single-instance uses flashinfer_mxfp4 with the cookbook defaults. DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' -# Default; the DP-attn branches below override per recipe. +# Default; the DP-attn branch below overrides to 0.94. MEM_FRACTION_STATIC=0.90 -MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" -REQUEST_RATE="inf" -if [ "${DP_ATTENTION}" = "true" ] && [ "$CONC" -ge 2048 ]; then - # Ultra-high-concurrency DP-attention recipe: TP=8, deepep mega_moe backend. - export SGLANG_LOG_FORWARD_ITERS=1 - export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 - export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 - export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 - export SGLANG_OPT_USE_FAST_MASK_EP=1 - export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 - export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=288 - export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 - PARALLEL_ARGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --cuda-graph-max-bs 288 - --deepep-config "$DEEPEP_CONFIG" - --chunked-prefill-size 65536 - --enable-prefill-delayer - ) - MEM_FRACTION_STATIC=0.87 - MAX_RUNNING_REQUESTS=2560 - REQUEST_RATE=16 -elif [ "${DP_ATTENTION}" = "true" ]; then +if [ "${DP_ATTENTION}" = "true" ]; then export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 @@ -136,7 +111,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$MAX_RUNNING_REQUESTS" \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & @@ -156,7 +131,6 @@ run_benchmark_serving \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ - --request-rate "$REQUEST_RATE" \ --result-filename "$RESULT_FILENAME" \ --result-dir "$PWD/" From 14369b1e66848dca58ab927629f51a05e5d480f1 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:12:32 +0800 Subject: [PATCH 16/17] dsv4-b300-sglang-mtp: tune EAGLE spec params from (3,1,4) to (4,1,5) Co-Authored-By: Claude Opus 4.6 --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 767b9a8f9..d04661466 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -10,8 +10,8 @@ source "$(dirname "$0")/../benchmark_lib.sh" # true -> deepep + mega_moe + chunked-prefill 32768 # false -> flashinfer_mxfp4 + chunked-prefill 8192 # -# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3, -# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands. +# EAGLE/MTP speculative-decoding flags are hardcoded to (4, 1, 5): num-steps=4, +# eagle-topk=1, num-draft-tokens=5. Same chain across all CONC bands. check_env_vars \ MODEL \ TP \ @@ -69,9 +69,9 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} # MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe. SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 3 + --speculative-num-steps 4 --speculative-eagle-topk 1 - --speculative-num-draft-tokens 4 + --speculative-num-draft-tokens 5 ) if [ "${DP_ATTENTION}" = "true" ]; then From 42b294d47a6a62aede23b68ab21bc66eeebfb775 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sun, 26 Apr 2026 21:20:46 +0800 Subject: [PATCH 17/17] Revert "dsv4-b300-sglang-mtp: tune EAGLE spec params from (3,1,4) to (4,1,5)" This reverts commit 14369b1e66848dca58ab927629f51a05e5d480f1. --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index d04661466..767b9a8f9 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -10,8 +10,8 @@ source "$(dirname "$0")/../benchmark_lib.sh" # true -> deepep + mega_moe + chunked-prefill 32768 # false -> flashinfer_mxfp4 + chunked-prefill 8192 # -# EAGLE/MTP speculative-decoding flags are hardcoded to (4, 1, 5): num-steps=4, -# eagle-topk=1, num-draft-tokens=5. Same chain across all CONC bands. +# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3, +# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands. check_env_vars \ MODEL \ TP \ @@ -69,9 +69,9 @@ DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96} # MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe. SPEC_FLAGS=( --speculative-algorithm EAGLE - --speculative-num-steps 4 + --speculative-num-steps 3 --speculative-eagle-topk 1 - --speculative-num-draft-tokens 5 + --speculative-num-draft-tokens 4 ) if [ "${DP_ATTENTION}" = "true" ]; then