From 4de7ce5298d282f81ea89a865873cd857740ffdf Mon Sep 17 00:00:00 2001
From: Liu
Date: Fri, 1 May 2026 00:11:28 +0100
Subject: [PATCH] fix(bench_serving): replace vllm get_tokenizer with direct transformers AutoTokenizer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

benchmark_serving.py imported get_tokenizer from vllm:

    try:
        from vllm.transformers_utils.tokenizer import get_tokenizer
    except ImportError:
        from backend_request_func import get_tokenizer

vllm's get_cached_tokenizer() accesses tokenizer.all_special_tokens_extended,
which does not exist on Rust-backed TokenizersBackend (e.g. GLM-5 / ZhipuAI
models). There is no Python slow tokenizer fallback for these models --
use_fast=False still returns TokenizersBackend -- so --tokenizer-mode slow is
also insufficient.

    AttributeError: TokenizersBackend has no attribute all_special_tokens_extended

The crash occurs in the benchmark client, not in the SGLang server.

## Fix

Replace the vllm import with a self-contained get_tokenizer() backed by
transformers.AutoTokenizer.from_pretrained(). This avoids vllm's
get_cached_tokenizer() entirely while maintaining full API compatibility with
all three call sites in benchmark_serving.py.

## Verification

Confirmed fix resolves the crash on:

- lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260422 (vllm 0.9.2rc2)
- lmsysorg/sglang-rocm:v0.5.10.post1-rocm700-mi35x-20260428

with model amd/GLM-5-MXFP4 (TP=8, MI355X, EAGLE MTP speculative decoding).

Co-Authored-By: Claude Sonnet 4.6
---
 utils/bench_serving/benchmark_serving.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py
index 68887c59b..5f1672789 100644
--- a/utils/bench_serving/benchmark_serving.py
+++ b/utils/bench_serving/benchmark_serving.py
@@ -45,10 +45,20 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
-try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
-    from backend_request_func import get_tokenizer
+def get_tokenizer(tokenizer_id, tokenizer_mode="auto", trust_remote_code=False, **kwargs):
+    """Load tokenizer directly via transformers, bypassing vllm's get_cached_tokenizer.
+
+    vllm's get_cached_tokenizer() accesses tokenizer.all_special_tokens_extended which
+    does not exist on Rust-backed TokenizersBackend (e.g. GLM-5). Using transformers
+    AutoTokenizer directly avoids that code path entirely.
+    """
+    from transformers import AutoTokenizer
+    use_fast = tokenizer_mode != "slow"
+    return AutoTokenizer.from_pretrained(
+        tokenizer_id,
+        use_fast=use_fast,
+        trust_remote_code=trust_remote_code,
+    )
 
 try:
     from vllm.utils import FlexibleArgumentParser
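
For reference, a minimal sketch (not part of the patch) of how the replacement
get_tokenizer() is exercised by the benchmark client. The checkpoint name is
taken from the verification section above; the driver script and the prompt
tokenization shown here are illustrative, not the exact call-site code in
benchmark_serving.py:

    # Sketch only: load the tokenizer the way the patched benchmark does,
    # going straight to transformers and skipping vllm's get_cached_tokenizer().
    from transformers import AutoTokenizer


    def get_tokenizer(tokenizer_id, tokenizer_mode="auto", trust_remote_code=False, **kwargs):
        use_fast = tokenizer_mode != "slow"  # only "slow" requests the Python tokenizer
        return AutoTokenizer.from_pretrained(
            tokenizer_id, use_fast=use_fast, trust_remote_code=trust_remote_code
        )


    if __name__ == "__main__":
        tok = get_tokenizer("amd/GLM-5-MXFP4", trust_remote_code=True)
        # On affected models this attribute is absent, which is exactly what
        # vllm's get_cached_tokenizer() tripped over in the benchmark client.
        print("has all_special_tokens_extended:", hasattr(tok, "all_special_tokens_extended"))
        # Typical benchmark-style usage: count prompt tokens for request metrics.
        print("prompt tokens:", len(tok("Hello, world!").input_ids))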