diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 767b9a8f9..d01f80a1d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -128,6 +128,12 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 
 pip install -q datasets pandas
 
+# --dsv4 routes prompts through encoding_dsv4.py (PR #1153), which emits the
+# ... framing DeepSeek-V4-Pro expects. The DSv4-Pro
+# tokenizer ships without a jinja chat_template, so plain --use-chat-template
+# would crash; --dsv4 sidesteps that and satisfies the AGENTS.md rule that all
+# MTP scripts must benchmark against chat-formatted inputs (EAGLE acceptance
+# silently regresses on raw random tokens).
 run_benchmark_serving \
     --model "$MODEL" \
     --port "$PORT" \
@@ -138,7 +144,8 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/"
+    --result-dir "$PWD/" \
+    --dsv4
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 992c64ecb..2bd2f025c 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1886,3 +1886,9 @@
     - "Image pinned to lmsysorg/sglang:deepseek-v4-blackwell@sha256:df18bfc4aa9ecf59451002b49ba00cae58042de9e2a96378bbd21b404dd62c7b"
     - "Adds SGLANG_OPT_* env knobs (SWA_SPLIT_LEAF_ON_INSERT, USE_JIT_NORM, USE_JIT_INDEXER_METADATA, USE_TOPK_V2, USE_CUSTOM_ALL_REDUCE_V2)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1187
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Pass --dsv4 to run_benchmark_serving so MTP benchmarks use the DSv4 chat template (PR #1153)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1182