From ee44ab5d3ed018a15b9616430b4597b99f1550c6 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sun, 26 Apr 2026 13:33:29 -0500
Subject: [PATCH 1/3] dsv4-fp4-b300-sglang-mtp: pass --dsv4 to use DSv4 chat
 template

Routes benchmark prompts through encoding_dsv4.py (added in PR #1153)
so DeepSeek-V4-Pro receives the <bos><User>...<Assistant><think>
framing it was trained against. PR #1166 had to drop --use-chat-template
because the DSv4-Pro tokenizer has no jinja chat_template; --dsv4
sidesteps that by applying the framing via encoding_dsv4.py instead.

Restores AGENTS.md compliance for MTP scripts, which must benchmark
against chat-formatted inputs: EAGLE acceptance rate silently regresses
when prompts are raw random tokens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh | 9 ++++++++-
 perf-changelog.yaml                                | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 767b9a8f9..d01f80a1d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -128,6 +128,12 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 
 pip install -q datasets pandas
 
+# --dsv4 routes prompts through encoding_dsv4.py (PR #1153), which emits the
+# <bos><User>...<Assistant><think> framing DeepSeek-V4-Pro expects. The DSv4-Pro
+# tokenizer ships without a jinja chat_template, so plain --use-chat-template
+# would crash; --dsv4 sidesteps that and satisfies the AGENTS.md rule that all
+# MTP scripts must benchmark against chat-formatted inputs (EAGLE acceptance
+# silently regresses on raw random tokens).
 run_benchmark_serving \
     --model "$MODEL" \
     --port "$PORT" \
@@ -138,7 +144,8 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/"
+    --result-dir "$PWD/" \
+    --dsv4
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 300d39c40..40e900cb0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1903,3 +1903,11 @@
     - "ISL=8192: TP4 conc 4-64; DP4 (dp-attn) conc 128-1024; DP8 (dp-attn) conc 1024-8192"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1155
 
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)"
+    - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps"
+    - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. chat-framed prompts"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+

From a351d197ccc68e72cc7871ddcf5765084189e3b9 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sun, 26 Apr 2026 13:33:55 -0500
Subject: [PATCH 2/3] perf-changelog: fill in PR #1182 link

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 40e900cb0..1fb8fe39a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1909,5 +1909,5 @@
     - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)"
     - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps"
     - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. chat-framed prompts"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182
 

From 5cb6b2139377352018b90a3fd5f4f8cb66bc6eea Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sun, 26 Apr 2026 13:35:24 -0500
Subject: [PATCH 3/3] perf-changelog: trim dsv4-mtp entry

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1fb8fe39a..3cc64439b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1906,8 +1906,6 @@
 - config-keys:
     - dsv4-fp4-b300-sglang-mtp
   description:
-    - "Pass --dsv4 (PR #1153) to run_benchmark_serving so prompts go through the DeepSeek-V4 chat template (encoding_dsv4.py)"
-    - "Restores AGENTS.md compliance: every MTP script must benchmark against chat-formatted inputs; PR #1166 had to drop --use-chat-template because the DSv4-Pro tokenizer has no jinja template, which is exactly what --dsv4 sidesteps"
-    - "EAGLE acceptance rate is sensitive to input distribution; raw random tokens silently regress acceptance vs. chat-framed prompts"
+    - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182