From bd5737e0f8a83a89c103dd3a013fc3a451b7b0b2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Tue, 7 Apr 2026 16:23:28 +0000 Subject: [PATCH] remove REGISTERED_TOKENIZER_CLASSES --- src/transformers/models/auto/tokenization_auto.py | 15 ++++----------- tests/models/auto/test_tokenization_auto.py | 5 ----- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 1b38f2e7a3f1..dbb8e4d95d97 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -58,7 +58,6 @@ logger = logging.get_logger(__name__) # V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based) -REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {} REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {} TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None]( @@ -412,8 +411,10 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None: if class_name in REGISTERED_FAST_ALIASES: return REGISTERED_FAST_ALIASES[class_name] - if class_name in REGISTERED_TOKENIZER_CLASSES: - return REGISTERED_TOKENIZER_CLASSES[class_name] + # User-registered classes take priority over built-ins + for tokenizer in TOKENIZER_MAPPING._extra_content.values(): + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer if class_name == "TokenizersBackend": return TokenizersBackend @@ -440,10 +441,6 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None: except AttributeError: continue - for tokenizer in TOKENIZER_MAPPING._extra_content.values(): - if getattr(tokenizer, "__name__", None) == class_name: - return tokenizer - # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # init and we return the proper dummy to get an appropriate error message. @@ -858,10 +855,6 @@ def register( else: raise ValueError("You need to pass a `tokenizer_class`") - for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class): - if candidate is not None: - REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate - if slow_tokenizer_class is not None and fast_tokenizer_class is not None: REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py index 2bc79a3f82d6..3c0150fca28f 100644 --- a/tests/models/auto/test_tokenization_auto.py +++ b/tests/models/auto/test_tokenization_auto.py @@ -45,7 +45,6 @@ from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig from transformers.models.auto.tokenization_auto import ( REGISTERED_FAST_ALIASES, - REGISTERED_TOKENIZER_CLASSES, TOKENIZER_MAPPING, TOKENIZER_MAPPING_NAMES, get_tokenizer_config, @@ -364,7 +363,6 @@ def test_new_tokenizer_registration(self): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None) @require_tokenizers def test_new_tokenizer_fast_registration(self): @@ -409,8 +407,6 @@ def test_new_tokenizer_fast_registration(self): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None) - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizerFast", None) REGISTERED_FAST_ALIASES.pop("CustomTokenizer", None) def test_from_pretrained_dynamic_tokenizer(self): @@ -523,7 +519,6 @@ class NewTokenizer(BertTokenizer): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None) def test_from_pretrained_dynamic_tokenizer_legacy_format(self): tokenizer = AutoTokenizer.from_pretrained(