diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 7e975fdba..f161e9bdc 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2510,6 +2510,27 @@ dsv4-fp8-h200-vllm:
     search-space:
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 }
 
+# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.0-cu130 image
+# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
+# --speculative-config '{"method":"mtp","num_speculative_tokens":1}'.
+dsv4-fp8-h200-vllm-mtp:
+  image: vllm/vllm-openai:v0.20.0-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp }
+
 # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300
 # pareto sweep. The single-node schema has no explicit data-parallel-size
 # field, so dp-attn=true is used as the existing vLLM script switch for DP4
diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
new file mode 100755
index 000000000..5a6834757
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# DeepSeek-V4-Pro H200 vLLM MTP variant of the recipe at
+# https://vllm.ai/blog/deepseek-v4. Mirrors dsv4_fp8_h200.sh but adds
+# --speculative-config '{"method":"mtp","num_speculative_tokens":1}' and
+# routes prompts through chat-formatted encoding via --dsv4 (required for
+# meaningful MTP acceptance numbers per AGENTS.md).
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+nvidia-smi
+
+hf download "$MODEL"
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+# DeepSeek-V4-Pro weights are large; engine startup can exceed the default
+# 600s. Give it an hour to load.
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Skip the cudagraph-memory estimator during the worker memory profiling
+# phase: it overestimates and pushes us over the GPU memory budget on
+# H200 + MTP, even though the actual cudagraph capture works fine.
+export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
+else
+    MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN"
+fi
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
+# from the search space is used only for GPU allocation by the runner and
+# as the DP size.
+set -x
+vllm serve $MODEL --host 0.0.0.0 --port $PORT \
+--trust-remote-code \
+--kv-cache-dtype fp8 \
+--block-size 256 \
+--no-enable-prefix-caching \
+--enable-expert-parallel \
+--data-parallel-size $TP \
+$MAX_MODEL_LEN_ARG \
+--gpu-memory-utilization 0.95 \
+--max-num-seqs 512 \
+--max-num-batched-tokens 512 \
+--no-enable-flashinfer-autotune \
+--compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \
+--speculative-config '{"method":"mtp","num_speculative_tokens":1}' \
+--tokenizer-mode deepseek_v4 \
+--tool-call-parser deepseek_v4 \
+--enable-auto-tool-choice \
+--reasoning-parser deepseek_v4 > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+# MTP acceptance rate degrades on raw random tokens; --dsv4 routes prompts
+# through chat-formatted encoding as required for speculative decoding benchmarks.
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code \
+    --dsv4
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+# Stop GPU monitoring
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd14e776..7009a1b8b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1985,3 +1985,13 @@
   - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd"
   - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218
+
+- config-keys:
+  - dsv4-fp8-h200-vllm-mtp
+  description:
+  - "Add DeepSeek-V4-Pro FP8 H200 vLLM MTP variant (mirrors dsv4-fp8-h200-vllm with --speculative-config {\"method\":\"mtp\",\"num_speculative_tokens\":1})"
+  - "Image: vllm/vllm-openai:v0.20.0-cu130"
+  - "Set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 to skip the cudagraph-memory estimator (it overshoots the H200 + MTP memory budget at profile time even though actual cudagraph capture works fine)"
+  - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens"
+  - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222
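
For anyone smoke-testing the new script outside the runner, a minimal invocation sketch follows the patch. It only restates the environment contract that check_env_vars enforces in the script above; the concrete values (concurrency, sequence lengths, max model length, range ratio, result filename) are illustrative placeholders, not the settings used by the sweep or the runner.

# Hypothetical local invocation of the new MTP benchmark script. All values
# below are placeholders chosen to satisfy check_env_vars; they are not the
# values the dsv4-fp8-h200-vllm-mtp search space passes in.
MODEL=deepseek-ai/DeepSeek-V4-Pro \
TP=8 \
CONC=32 \
ISL=1024 \
OSL=1024 \
MAX_MODEL_LEN=4096 \
RANDOM_RANGE_RATIO=0.8 \
RESULT_FILENAME=dsv4_fp8_h200_mtp_1k1k_c32.json \
bash benchmarks/single_node/dsv4_fp8_h200_mtp.sh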