From ee44ab5d3ed018a15b9616430b4597b99f1550c6 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 26 Apr 2026 13:33:29 -0500 Subject: [PATCH 1/3] dsv4-fp4-b300-sglang-mtp: pass --dsv4 to use DSv4 chat template Routes benchmark prompts through encoding_dsv4.py (added in PR #1153) so DeepSeek-V4-Pro receives the chat-template framing it was trained against. PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja chat_template, which is exactly what --dsv4 sidesteps. Restores AGENTS.md compliance for MTP scripts (EAGLE acceptance rate silently regresses against raw random tokens). Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 9 ++++++++- perf-changelog.yaml | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 767b9a8f9..d01f80a1d 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -128,6 +128,12 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S pip install -q datasets pandas +# --dsv4 routes prompts through encoding_dsv4.py (PR #1153), which emits the +# chat-template framing DeepSeek-V4-Pro expects. The DSv4-Pro +# tokenizer ships without a jinja chat_template, so plain --use-chat-template +# would crash; --dsv4 sidesteps that and satisfies the AGENTS.md rule that all +# MTP scripts must benchmark against chat-formatted inputs (EAGLE acceptance +# silently regresses on raw random tokens). 
run_benchmark_serving \ --model "$MODEL" \ --port "$PORT" \ @@ -138,7 +144,8 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir "$PWD/" + --result-dir "$PWD/" \ + --dsv4 if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 300d39c40..40e900cb0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1903,3 +1903,11 @@ - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155 +- config-keys: + - dsv4-fp4-b300-sglang-mtp + description: + - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)" + - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps" + - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. 
chat-framed prompts" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + From a351d197ccc68e72cc7871ddcf5765084189e3b9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 26 Apr 2026 13:33:55 -0500 Subject: [PATCH 2/3] perf-changelog: fill in PR #1182 link Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 40e900cb0..1fb8fe39a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1909,5 +1909,5 @@ - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)" - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps" - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. 
chat-framed prompts" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182 From 5cb6b2139377352018b90a3fd5f4f8cb66bc6eea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 26 Apr 2026 13:35:24 -0500 Subject: [PATCH 3/3] perf-changelog: trim dsv4-mtp entry Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1fb8fe39a..3cc64439b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1906,8 +1906,6 @@ - config-keys: - dsv4-fp4-b300-sglang-mtp description: - - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)" - - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps" - - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. chat-framed prompts" + - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182