20 commits
148223d - sglang dsv4 mtp (hnyls2002, Apr 26, 2026)
c883e8d - knob-driven recipe selection (hnyls2002, Apr 26, 2026)
3a49ed1 - self-contained mtp config; recipe via dp-attn (hnyls2002, Apr 26, 2026)
6f1b80a - add mtp_1 (1/1/2) variant (hnyls2002, Apr 26, 2026)
1b34a8d - knob-driven recipe selection (hnyls2002, Apr 26, 2026)
481482a - pin sglang image to mega_moe-capable digest (hnyls2002, Apr 26, 2026)
47fefec - drop mtp_1 knob; align with PR #1158 image digest (hnyls2002, Apr 26, 2026)
bfa254d - Merge branch 'main' into sglang-dsv4-MTP (Oseltamivir, Apr 26, 2026)
287ef26 - update nvidia-master.yaml (yhyang201, Apr 26, 2026)
e4ddf8f - Merge branch 'main' into sglang-dsv4-MTP (yhyang201, Apr 26, 2026)
f64505b - fix: restore trailing newline in perf-changelog.yaml (yhyang201, Apr 26, 2026)
4f468d6 - fix: remove --use-chat-template and floor --max-running-requests at 8 (yhyang201, Apr 26, 2026)
fc93e84 - perf-changelog: add dsv4-fp4-b300-sglang-mtp entry (yhyang201, Apr 26, 2026)
4155a49 - merge main and resolve perf-changelog.yaml conflict (yhyang201, Apr 26, 2026)
cea70e5 - dsv4-b300-sglang: add conc=2048 8k1k recipe with finite request-rate (yhyang201, Apr 26, 2026)
97a7e7d - dsv4-b300-sglang: temporarily keep only conc=2048 8k1k for experiment (yhyang201, Apr 26, 2026)
628e47b - Revert "dsv4-b300-sglang: temporarily keep only conc=2048 8k1k for experiment" (yhyang201, Apr 26, 2026)
1526e9d - Revert "dsv4-b300-sglang: add conc=2048 8k1k recipe with finite request-rate" (yhyang201, Apr 26, 2026)
14369b1 - dsv4-b300-sglang-mtp: tune EAGLE spec params from (3,1,4) to (4,1,5) (yhyang201, Apr 26, 2026)
42b294d - Revert "dsv4-b300-sglang-mtp: tune EAGLE spec params from (3,1,4) to (4,1,5)" (yhyang201, Apr 26, 2026)
30 changes: 30 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -1867,6 +1867,36 @@ dsv4-fp4-b300-sglang:
        - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
        - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }

# DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
# selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
# DP_ATTENTION:
# dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
# dp-attn: true -> DP-attn + deepep mega_moe + chunked-prefill 32768
# `ep` is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while the TP-only path leaves ep_size at the default of 1.
dsv4-fp4-b300-sglang-mtp:
  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: b300
  precision: fp4
  framework: sglang
  multinode: false
  # Three CONC bands sweep with EAGLE/MTP (3/1/4) on top:
  #   A: TP=8 ep=1         -- conc 1-8    (latency-bound, full TP)
  #   B: TP=4 ep=1         -- conc 16-128 (TP-only, mid batch)
  #   C: TP=4 ep=4 dp-attn -- conc 64-512 (DP-attn + EP, large batch)
  # Overlap: B/C at conc 64,128 (TP-only vs DP-attn EP head-to-head).
  seq-len-configs:
    - isl: 1024
      osl: 1024
      search-space:
        - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
    - isl: 8192
      osl: 1024
      search-space:
        - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }

qwen3.5-bf16-b200-sglang:
  image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
  model: Qwen/Qwen3.5-397B-A17B
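For reference, a single matrix point from this new config can be exercised locally by exporting the variables the script's check_env_vars guard requires. This is a sketch: the CI launcher's real invocation is not shown in this PR, and the RANDOM_RANGE_RATIO and RESULT_FILENAME values below are illustrative, not taken from the launcher.

  # Hypothetical local run of one band-A point (1k1k, conc=8). Variable names
  # come from check_env_vars in dsv4_fp4_b300_sglang_mtp.sh; values are
  # illustrative only.
  MODEL=deepseek-ai/DeepSeek-V4-Pro \
  TP=8 EP_SIZE=1 DP_ATTENTION=false \
  CONC=8 ISL=1024 OSL=1024 \
  RANDOM_RANGE_RATIO=1.0 \
  RESULT_FILENAME=dsv4_mtp_1k1k_conc8 \
  bash benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh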
149 changes: 149 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -0,0 +1,149 @@
#!/usr/bin/env bash

source "$(dirname "$0")/../benchmark_lib.sh"

# Tuning inputs from the matrix (all required):
# TP -- tensor parallel size -> --tp
# EP_SIZE -- expert parallel size -> --ep-size
# DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP
# Also selects MoE backend / chunked-prefill-size:
# true -> deepep + mega_moe + chunked-prefill 32768
# false -> flashinfer_mxfp4 + chunked-prefill 8192
#
# EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3,
# eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands.
check_env_vars \
  MODEL \
  TP \
  EP_SIZE \
  DP_ATTENTION \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
# `hf download`. Only fetch when MODEL looks like a HF repo ID.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe.
SPEC_FLAGS=(
  --speculative-algorithm EAGLE
  --speculative-num-steps 3
  --speculative-eagle-topk 1
  --speculative-num-draft-tokens 4
)

if [ "${DP_ATTENTION}" = "true" ]; then
  # Large-batch EP path: deepep + mega_moe.
  export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
  export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
  export SGLANG_OPT_USE_FAST_MASK_EP=1
  export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
  export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
  export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
  PARALLEL_ARGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
  )
  CHUNKED_PREFILL_SIZE=32768
else
  # Small-batch TP-only path: flashinfer_mxfp4.
  PARALLEL_ARGS=(
    --moe-runner-backend flashinfer_mxfp4
    --disable-flashinfer-autotune
  )
  CHUNKED_PREFILL_SIZE=8192
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
# launch config is auditable from the result artifact alone.
{
echo "=== SGLANG_* env vars at launch ==="
env | grep -E '^SGLANG_' | sort
echo "==================================="
} | tee "$SERVER_LOG"

set -x
PYTHONNOUSERSITE=1 sglang serve \
  --model-path $MODEL \
  --host 0.0.0.0 \
  --port $PORT \
  --trust-remote-code \
  --tp $TP \
  --ep-size $EP_SIZE \
  --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
  --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
  --mem-fraction-static 0.90 \
  --swa-full-tokens-ratio 0.1 \
  "${SPEC_FLAGS[@]}" \
Comment on lines +119 to +122
🟡 The new MTP script hardcodes --swa-full-tokens-ratio 0.1 at line 121, while the parent dsv4_fp4_b300_sglang.sh uses an ISL-conditional that picks 0.5 for ISL=1024 with the explicit comment that '0.5 was tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default'. Since the MTP YAML exercises both 1024/1024 and 8192/1024, the 1k1k MTP run silently drops the empirical tuning — please either mirror the conditional or add a comment explaining why MTP intentionally diverges.

Extended reasoning...

The divergence

The parent benchmarks/single_node/dsv4_fp4_b300_sglang.sh (lines ~98-104 after this PR) sets SWA_FULL_TOKENS_RATIO based on ISL:

# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
if [[ "$ISL" == "1024" ]]; then
    SWA_FULL_TOKENS_RATIO=0.5
else
    SWA_FULL_TOKENS_RATIO=0.1
fi

The new dsv4_fp4_b300_sglang_mtp.sh at line 121 instead hardcodes:

--swa-full-tokens-ratio 0.1 \

with no ISL branching.

Why this matters

The new MTP YAML config at .github/configs/nvidia-master.yaml (lines ~1885-1893) exercises both isl: 1024 and isl: 8192 sequence-length configs. So the 1k1k MTP run will use the cookbook default 0.1 instead of the empirically tuned 0.5 that the parent script's author specifically called out as needed for B300 cache headroom on 1k inputs.

Step-by-step proof

  1. CI launches dsv4-fp4-b300-sglang-mtp with the band-A entry { tp: 8, ep: 1, conc-start: 1, conc-end: 8 } against isl: 1024, osl: 1024.
  2. The launcher invokes dsv4_fp4_b300_sglang_mtp.sh with ISL=1024.
  3. At line 121, the script unconditionally passes --swa-full-tokens-ratio 0.1 to sglang serve.
  4. The parent script, given the same ISL=1024, would have passed 0.5 per its empirical tuning comment.
  5. Result: the 1k1k MTP sweep runs with the cookbook default the parent script's author explicitly flagged as suboptimal for B300 1k inputs.

Addressing the refutation

The refutation argues the MTP script is a deliberately distinct recipe, that EAGLE adds memory overhead favoring smaller SWA reservation, and that low concurrencies (band A only, per bug_002) make SWA pressure minimal. These are reasonable hypotheses, but they are hypotheses — the parent script's comment is an empirical claim the author already calibrated, and the MTP recipe inherits the rest of the parent's tuning surface (same model, same B300 hardware, same ISL=1024). If the divergence is intentional (e.g., EAGLE memory overhead changes the optimal SWA tradeoff), a one-line comment would document that and prevent future readers from assuming the omission was an oversight. The fact that the recipe author left no such comment, while the parent author did leave one specifically calling out the 1k vs 8k distinction, is exactly the signal that this deserves to be flagged.

Suggested fix

Either mirror the parent's ISL-conditional:

if [[ "$ISL" == "1024" ]]; then
    SWA_FULL_TOKENS_RATIO=0.5
else
    SWA_FULL_TOKENS_RATIO=0.1
fi

and substitute --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" at line 121, or add a brief comment at line 121 explaining why MTP intentionally uses the cookbook default (e.g., 'EAGLE draft-model + verification overhead favors smaller SWA reservation, so we use the cookbook default instead of the parent's 1k1k empirical 0.5').

"${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts $((CONC * 10)) \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir "$PWD/"

if [ "${RUN_EVAL}" = "true" ]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
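
As a quick check of the --max-running-requests expression above (commit 4f468d6 introduced the floor of 8), bash integer arithmetic at a few sweep points gives:

  # Same ternary as the serve flag: max(CONC * 3 / 2, 8) with integer division.
  for CONC in 1 2 8 32 512; do
    echo "CONC=$CONC -> $(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))"
  done
  # CONC=1 -> 8, CONC=2 -> 8, CONC=8 -> 12, CONC=32 -> 48, CONC=512 -> 768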
12 changes: 12 additions & 0 deletions perf-changelog.yaml
@@ -1875,3 +1875,15 @@
- "better performance for dp-attention"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174

- config-keys:
    - dsv4-fp4-b300-sglang-mtp
  description:
    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark with EAGLE/MTP speculative decoding"
    - "Image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 (pinned for deep_gemm transform_weights_for_mega_moe support; same digest as PR #1158)"
    - "Model: deepseek-ai/DeepSeek-V4-Pro"
    - "EAGLE/MTP flags hardcoded in script: num-steps=3, eagle-topk=1, num-draft-tokens=4"
    - "Recipe (MoE backend, chunked-prefill) selected in script by dp-attn: TP-only + flashinfer_mxfp4 (small batch) vs DP-attn + deepep mega_moe (large batch)"
    - "Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512); B/C overlap at conc 64,128"
    - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
Comment on lines +1879 to +1889
🟡 The newly added dsv4-fp4-b300-sglang-mtp changelog entry sets pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166, but this is PR #1176; PR #1166 is an unrelated open PR. Likely a digit transposition — please change the link to /pull/1176 so the changelog correctly attributes the entry once merged.

Extended reasoning...

What the bug is

perf-changelog.yaml lines 1879-1889 add a new entry for the dsv4-fp4-b300-sglang-mtp config (the new MTP benchmark introduced by this PR). Its pr-link is set to:

pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

But this PR is #1176 ("dsv4-b300-sglang: add conc=2048 recipe & MTP benchmark"). PR #1166 is a different, unrelated PR. The link should be /pull/1176.

Why this is wrong

The clear convention in perf-changelog.yaml is that pr-link points to the PR that introduces the entry — every prior entry in the file follows this pattern (e.g., the entry above this one points to PR #1174, which was the merged PR introducing it; the entry being added here is the one this PR adds and so should point to #1176).

A grep of perf-changelog.yaml shows pull/1166 only appears at this new entry on line 1889, and pull/1176 does not appear anywhere in the file — so the typo is not duplicated and there is no other entry that should already be carrying #1176.

Step-by-step proof

  1. Open the PR diff for perf-changelog.yaml. The entire 11-line block at lines 1879-1889 is being newly added (no - lines, only +), and the new config-keys is dsv4-fp4-b300-sglang-mtp.
  2. Look up the config: dsv4-fp4-b300-sglang-mtp is also added by this same PR in .github/configs/nvidia-master.yaml (lines 1875+) and by the new script benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh. The description in the changelog ("Three CONC bands: A=TP8 (1-8), B=TP4 (16-128), C=DP4 dp-attn (64-512)", EAGLE/MTP flags 3/1/4, image digest sha256:26e116bd…f5bc070f3) matches exactly what this PR adds in those files. So the entry is unambiguously the one being introduced by this PR.
  3. The PR number is 1176 (per the PR metadata). PR #1166 is a different PR: an open PR titled 'sglang dsv4 MTP', not yet merged. They share the digits "116", consistent with a transposition typo.
  4. Convention check: every other recent entry in this file uses pr-link matching the PR that adds it (e.g., the entry directly above this one at line 1877 is /pull/1174, matching the prior commit fc93e84; the commit message of fc93e84 itself even says "append the MTP config entry for PR #1166" — confirming 1166 was a placeholder/wrong number rather than an intentional cross-reference).

Impact

Documentation/metadata only — no runtime behavior is affected. However, once merged, the changelog will permanently misattribute the introduction of dsv4-fp4-b300-sglang-mtp to PR #1166 (an unrelated PR by a different author), so anyone clicking through to find the introducing PR will land on the wrong page. Easy to fix before merge.

How to fix

Change line 1889 from:

  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166

to:

  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1176
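
The grep check described above is reproducible from the repo root; before the fix it should print a single pull/1166 hit at the new entry and nothing for pull/1176 (expected output per the comment, not independently verified):

  grep -n 'pull/1166\|pull/1176' perf-changelog.yaml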
