Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ wait_for_server_ready() {
}

# Run benchmark serving with standardized parameters
# All parameters are required except --use-chat-template and --trust-remote-code
# All parameters are required except --use-chat-template, --dsv4, --trust-remote-code, and --server-pid
# Parameters:
# --model: Model name
# --port: Server port
Expand All @@ -178,6 +178,9 @@ wait_for_server_ready() {
# --result-filename: Result filename without extension
# --result-dir: Result directory
# --use-chat-template: Optional flag to enable chat template
# --dsv4: Optional flag to use the DeepSeek-V4 chat template
# (encoding_dsv4.py) instead of the tokenizer's built-in jinja
# template. Implies --use-chat-template.
# --trust-remote-code: Optional flag to trust remote code from HuggingFace
# --server-pid: Optional server process ID to monitor during benchmark
run_benchmark_serving() {
Expand All @@ -200,6 +203,7 @@ run_benchmark_serving() {
local result_dir=""
local workspace_dir=""
local use_chat_template=false
local dsv4=false
local trust_remote_code=false
local server_pid=""

Expand Down Expand Up @@ -253,6 +257,11 @@ run_benchmark_serving() {
use_chat_template=true
shift
;;
--dsv4)
dsv4=true
use_chat_template=true
shift
;;
--trust-remote-code)
trust_remote_code=true
shift
Expand Down Expand Up @@ -353,6 +362,12 @@ run_benchmark_serving() {
benchmark_cmd+=(--use-chat-template)
fi

# Add --dsv4 if requested (requires --use-chat-template, which we
# auto-enable when --dsv4 is passed in).
if [[ "$dsv4" == true ]]; then
benchmark_cmd+=(--dsv4)
fi

# Add --trust-remote-code if requested
if [[ "$trust_remote_code" == true ]]; then
benchmark_cmd+=(--trust-remote-code)
Expand Down
56 changes: 40 additions & 16 deletions utils/bench_serving/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from argparse import ArgumentParser as FlexibleArgumentParser

from benchmark_utils import convert_to_pytorch_benchmark_format
from encoding_dsv4 import encode_messages as dsv4_encode_messages

MILLISECONDS_TO_SECONDS_CONVERSION = 1000

Expand Down Expand Up @@ -104,10 +105,30 @@ def _init_tokenizer_worker(tokenizer_id, tokenizer_mode, trust_remote_code):
)


def _apply_chat_template(prompt, tokenizer, dsv4):
    """Wrap `prompt` as a single user turn and render it to a prompt string.

    Args:
        prompt: Text used as the content of the lone user message.
        tokenizer: Object exposing `apply_chat_template`; only consulted
            when `dsv4` is falsy.
        dsv4: When truthy, render with the DeepSeek-V4 encoder
            (`encoding_dsv4.encode_messages`, imported here as
            `dsv4_encode_messages`) in "thinking" mode, which is meant to
            produce the <bos><User>...<Assistant><think> framing. When
            falsy, defer to the tokenizer's built-in jinja chat template.

    Returns:
        Whatever the selected renderer returns for the one-message
        conversation (the tokenizer path requests untokenized output with
        the generation prompt appended).
    """
    conversation = [{"role": "user", "content": prompt}]

    # Default path: the tokenizer's own template, as text rather than ids.
    if not dsv4:
        return tokenizer.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=False,
        )

    # DeepSeek-V4 path: bypass the tokenizer template entirely.
    return dsv4_encode_messages(conversation, thinking_mode="thinking")


def _process_prompt_chunk(chunk_args):
"""Generate a chunk of random prompts in a worker process."""
(indices, prefix_token_ids, input_lens, output_lens, offsets,
prefix_len, vocab_size, use_chat_template, seed) = chunk_args
prefix_len, vocab_size, use_chat_template, dsv4, seed) = chunk_args

rng = np.random.RandomState(seed)
tokenizer = _worker_tokenizer
Expand Down Expand Up @@ -135,11 +156,7 @@ def _process_prompt_chunk(chunk_args):
prompt = tokenizer.decode(prompt_token_ids)

if use_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt = _apply_chat_template(prompt, tokenizer, dsv4)

prompt_len = len(tokenizer.encode(prompt, add_special_tokens=False))
mismatch = prompt_len - tgt_prompt_len
Expand All @@ -156,6 +173,7 @@ def sample_random_requests(
range_ratio: float,
tokenizer: PreTrainedTokenizerBase,
use_chat_template: bool = False,
dsv4: bool = False,
tokenizer_id: Optional[str] = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
Expand All @@ -164,12 +182,11 @@ def sample_random_requests(
vocab_size = tokenizer.vocab_size
prefix_token_ids = np.random.randint(0, vocab_size, size=prefix_len).tolist()

if dsv4 and not use_chat_template:
raise ValueError("--dsv4 requires --use-chat-template to be set.")

if use_chat_template:
chat_template_dummy = tokenizer.apply_chat_template(
[{"role": "user", "content": "a"}],
add_generation_prompt=True,
tokenize=False,
)
chat_template_dummy = _apply_chat_template("a", tokenizer, dsv4)
tokenized_chat_template_dummy = tokenizer.encode(chat_template_dummy, add_special_tokens=False)
chat_template_len = len(tokenized_chat_template_dummy) - 1
input_len = input_len - chat_template_len
Expand Down Expand Up @@ -215,6 +232,7 @@ def sample_uniform(seq_len):
prefix_len,
vocab_size,
use_chat_template,
dsv4,
int(local_rng.randint(0, 2**31)),
))

Expand Down Expand Up @@ -261,11 +279,7 @@ def sample_uniform(seq_len):
prompt = tokenizer.decode(prompt_token_ids)

if use_chat_template:
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt = _apply_chat_template(prompt, tokenizer, dsv4)

prompt_len = len(tokenizer.encode(prompt, add_special_tokens=False))
mismatches.append(prompt_len - tgt_prompt_len)
Expand Down Expand Up @@ -783,6 +797,7 @@ def main(args: argparse.Namespace):
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
use_chat_template=args.use_chat_template,
dsv4=args.dsv4,
tokenizer_id=tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code,
Expand Down Expand Up @@ -1156,6 +1171,15 @@ def main(args: argparse.Namespace):
"0 (default) = auto (min(cpu_count, 8)). 1 = serial (no multiprocessing).",
)

dsv4_group = parser.add_argument_group("DeepSeek-V4 chat template options")
dsv4_group.add_argument(
"--dsv4",
action="store_true",
help="Use the DeepSeek-V4 chat template (encoding_dsv4.py) instead of "
"the tokenizer's built-in jinja chat template. Requires "
"--use-chat-template to also be set. Applies to the random dataset.",
)

hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
type=str,
Expand Down
Loading
Loading