From d2d42f80f5c4f64d5b9a3d427c87f482895760ea Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:12:19 -0400 Subject: [PATCH 1/5] Add DSv4 FP8 H200 vLLM MTP benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror of dsv4-fp8-h200-vllm + --speculative-config '{"method":"mtp","num_speculative_tokens":2}', so we get an MTP counterpart of the existing H200 vLLM DeepSeek-V4-Pro recipe at https://vllm.ai/blog/deepseek-v4. - Image: vllm/vllm-openai:v0.20.0-cu130 (canonical v0.20.0; the non-MTP entry is still on the deepseekv4-cu129 tag). - Launch flags otherwise identical to dsv4_fp8_h200.sh: EP + DP=$TP, --gpu-memory-utilization 0.95, --max-num-seqs 512, --no-enable-flashinfer-autotune, FULL_DECODE_ONLY compile. - run_benchmark_serving uses --dsv4 per the AGENTS.md MTP rule — EAGLE-style spec decoding regresses acceptance on raw random tokens. - Search space mirrors the non-MTP H200 entry (TP=8, EP=8, DP-attn, CONC 4-64, both 1k1k and 8k1k) with spec-decoding: mtp. Adds a perf-changelog entry to trigger the new config. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 21 ++++ .../single_node/dsv4_fp8_h200_vllm_mtp.sh | 99 +++++++++++++++++++ perf-changelog.yaml | 9 ++ 3 files changed, 129 insertions(+) create mode 100755 benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 7e975fdba..f161e9bdc 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2510,6 +2510,27 @@ dsv4-fp8-h200-vllm: search-space: - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64 } +# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.0-cu130 image +# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
+dsv4-fp8-h200-vllm-mtp: + image: vllm/vllm-openai:v0.20.0-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 64, spec-decoding: mtp } + # DeepSeek-V4-Pro B300 single-node aggregate recipe from the submitted B300 # pareto sweep. The single-node schema has no explicit data-parallel-size # field, so dp-attn=true is used as the existing vLLM script switch for DP4 diff --git a/benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh new file mode 100755 index 000000000..b6359c9d0 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +# DeepSeek-V4-Pro H200 vLLM MTP variant of the recipe at +# https://vllm.ai/blog/deepseek-v4. Mirrors dsv4_fp8_h200.sh but adds +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}' and +# routes prompts through chat-formatted encoding via --dsv4 (required for +# meaningful MTP acceptance numbers per AGENTS.md). + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +nvidia-smi + +hf download "$MODEL" + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +# DeepSeek-V4-Pro weights are large; engine startup can exceed the default +# 600s. Give it an hour to load. 
+export VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN"
+else
+    MAX_MODEL_LEN_ARG="--max-model-len 800000"
+fi
+
+# Start GPU monitoring (power, temperature, clocks every second)
+start_gpu_monitor
+
+# Per the recipe, run with EP + DP=8 (no --tensor-parallel-size flag). TP
+# from the search space is used only for GPU allocation by the runner and
+# as the DP size. $MAX_MODEL_LEN_ARG is left unquoted so it splits into flag + value.
+set -x
+vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+--trust-remote-code \
+--kv-cache-dtype fp8 \
+--block-size 256 \
+--no-enable-prefix-caching \
+--enable-expert-parallel \
+--data-parallel-size "$TP" \
+$MAX_MODEL_LEN_ARG \
+--gpu-memory-utilization 0.95 \
+--max-num-seqs 512 \
+--max-num-batched-tokens 512 \
+--no-enable-flashinfer-autotune \
+--compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \
+--speculative-config '{"method":"mtp","num_speculative_tokens":2}' \
+--tokenizer-mode deepseek_v4 \
+--tool-call-parser deepseek_v4 \
+--enable-auto-tool-choice \
+--reasoning-parser deepseek_v4 > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+# MTP acceptance rate degrades on raw random tokens; --dsv4 routes prompts
+# through chat-formatted encoding as required for speculative decoding benchmarks.
+run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code \ + --dsv4 + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +# Stop GPU monitoring +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd14e776..2e64c37db 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1985,3 +1985,12 @@ - "Topology: 1 prefill DEP8 worker and 4 decode TP8 workers with dedicated NATS/etcd" - "Mirrors the historical 1P4D DEP8/TP8 offload point from srt-slurm aflowers/vllm-gb200-v0.20.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1218 + +- config-keys: + - dsv4-fp8-h200-vllm-mtp + description: + - "Add DeepSeek-V4-Pro FP8 H200 vLLM MTP variant (mirrors dsv4-fp8-h200-vllm with --speculative-config {\"method\":\"mtp\",\"num_speculative_tokens\":2})" + - "Image: vllm/vllm-openai:v0.20.0-cu130" + - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens" + - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 112d0058932288326f5bb1f19839f6c7b293de37 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:12:40 -0400 Subject: [PATCH 2/5] perf-changelog: fill in PR link for dsv4-fp8-h200-vllm-mtp Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2e64c37db..4d5d71eb8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1993,4 +1993,4 @@ - "Image: vllm/vllm-openai:v0.20.0-cu130" - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens" - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222 From 5461147cdec670cd09fd2f918555b7548fee5556 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:20:44 -0400 Subject: [PATCH 3/5] dsv4-fp8-h200-vllm-mtp: rename script to match H200 runner convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The H200 runner (runners/launch_h200-cw.sh) constructs the script name as ${MODEL_CODE}_${PRECISION}_h200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh where FRAMEWORK_SUFFIX is empty for vllm — so it expects benchmarks/single_node/dsv4_fp8_h200_mtp.sh, not the framework-named dsv4_fp8_h200_vllm_mtp.sh. Run 12597 failed with "No such file or directory"; rename to fix it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../{dsv4_fp8_h200_vllm_mtp.sh => dsv4_fp8_h200_mtp.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmarks/single_node/{dsv4_fp8_h200_vllm_mtp.sh => dsv4_fp8_h200_mtp.sh} (100%) diff --git a/benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh similarity index 100% rename from benchmarks/single_node/dsv4_fp8_h200_vllm_mtp.sh rename to benchmarks/single_node/dsv4_fp8_h200_mtp.sh From 4e6f92eb91b4f6d1a3e2efb439a2336401eeb447 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Apr 2026 00:44:43 -0400 Subject: [PATCH 4/5] dsv4-fp8-h200-vllm-mtp: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0, num_speculative_tokens=1 - Export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 before vllm serve. The estimator overshoots H200 + MTP at memory-profile time and pushes us over budget even though actual cudagraph capture works fine. - Drop num_speculative_tokens from 2 to 1 for now; bring it back up once we have a stable baseline on this image. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 9 +++++++-- perf-changelog.yaml | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index b6359c9d0..0c63fb7ae 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -2,7 +2,7 @@ # DeepSeek-V4-Pro H200 vLLM MTP variant of the recipe at # https://vllm.ai/blog/deepseek-v4. Mirrors dsv4_fp8_h200.sh but adds -# --speculative-config '{"method":"mtp","num_speculative_tokens":2}' and +# --speculative-config '{"method":"mtp","num_speculative_tokens":1}' and # routes prompts through chat-formatted encoding via --dsv4 (required for # meaningful MTP acceptance numbers per AGENTS.md). @@ -32,6 +32,11 @@ PORT=${PORT:-8888} # 600s. 
Give it an hour to load. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# Skip the cudagraph-memory estimator during the worker memory profiling +# phase — it overestimates and pushes us over the GPU memory budget on +# H200 + MTP, even though the actual cudagraph capture works fine. +export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 + if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN" @@ -59,7 +64,7 @@ $MAX_MODEL_LEN_ARG \ --max-num-batched-tokens 512 \ --no-enable-flashinfer-autotune \ --compilation-config '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' \ ---speculative-config '{"method":"mtp","num_speculative_tokens":2}' \ +--speculative-config '{"method":"mtp","num_speculative_tokens":1}' \ --tokenizer-mode deepseek_v4 \ --tool-call-parser deepseek_v4 \ --enable-auto-tool-choice \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4d5d71eb8..7009a1b8b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1989,8 +1989,9 @@ - config-keys: - dsv4-fp8-h200-vllm-mtp description: - - "Add DeepSeek-V4-Pro FP8 H200 vLLM MTP variant (mirrors dsv4-fp8-h200-vllm with --speculative-config {\"method\":\"mtp\",\"num_speculative_tokens\":2})" + - "Add DeepSeek-V4-Pro FP8 H200 vLLM MTP variant (mirrors dsv4-fp8-h200-vllm with --speculative-config {\"method\":\"mtp\",\"num_speculative_tokens\":1})" - "Image: vllm/vllm-openai:v0.20.0-cu130" + - "Set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 to skip the cudagraph-memory estimator (it overshoots the H200 + MTP memory budget at profile time even though actual cudagraph capture works fine)" - "run_benchmark_serving uses --dsv4 (chat-formatted prompts) per the AGENTS.md MTP rule, since EAGLE-style speculative decoding regresses acceptance on raw random tokens" - "Search space mirrors the non-MTP H200 entry: TP=8, EP=8, DP-attn=true, CONC 4-64 for both 1k1k and 8k1k, with spec-decoding: mtp" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1222 From 
e71f6f18acec5436e06fd0b0641cc616c9c3d83d Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Apr 2026 01:08:33 -0400 Subject: [PATCH 5/5] dsv4-fp8-h200-vllm-mtp: use $MAX_MODEL_LEN from runner instead of hardcoded 800k Take the max-model-len from the runner-supplied MAX_MODEL_LEN env var (added to check_env_vars) so the value is set centrally per config instead of pinned in the script. Eval-only path is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp8_h200_mtp.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh index 0c63fb7ae..5a6834757 100755 --- a/benchmarks/single_node/dsv4_fp8_h200_mtp.sh +++ b/benchmarks/single_node/dsv4_fp8_h200_mtp.sh @@ -14,6 +14,7 @@ check_env_vars \ CONC \ ISL \ OSL \ + MAX_MODEL_LEN \ RANDOM_RANGE_RATIO \ RESULT_FILENAME @@ -41,7 +42,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context MAX_MODEL_LEN_ARG="--max-model-len $EVAL_MAX_MODEL_LEN" else - MAX_MODEL_LEN_ARG="--max-model-len 800000" + MAX_MODEL_LEN_ARG="--max-model-len $MAX_MODEL_LEN" fi # Start GPU monitoring (power, temperature, clocks every second)