15 changes: 4 additions & 11 deletions src/transformers/models/auto/tokenization_auto.py
@@ -58,7 +58,6 @@
 logger = logging.get_logger(__name__)

 # V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based)
-REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {}
 REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {}

 TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None](
@@ -412,8 +411,10 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None:
     if class_name in REGISTERED_FAST_ALIASES:
         return REGISTERED_FAST_ALIASES[class_name]

-    if class_name in REGISTERED_TOKENIZER_CLASSES:
-        return REGISTERED_TOKENIZER_CLASSES[class_name]
+    # User-registered classes take priority over built-ins
+    for tokenizer in TOKENIZER_MAPPING._extra_content.values():
+        if getattr(tokenizer, "__name__", None) == class_name:
+            return tokenizer

     if class_name == "TokenizersBackend":
         return TokenizersBackend
@@ -440,10 +441,6 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None:
         except AttributeError:
             continue

-    for tokenizer in TOKENIZER_MAPPING._extra_content.values():
-        if getattr(tokenizer, "__name__", None) == class_name:
-            return tokenizer
-
     # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
Collaborator
Suggested change
-    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main

unrelated to this PR but might as well rm this duplicate comment 😅

     # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
     # init and we return the proper dummy to get an appropriate error message.
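
For readers following the lookup change above, here is a minimal sketch of how a user-registered tokenizer now resolves by name. It is an illustration, not part of the diff: it assumes the V5 register call accepts a tokenizer_class argument (as implied by the hunk below), that PretrainedConfig and PreTrainedTokenizer remain importable from the top-level package, and it reuses the hypothetical CustomConfig / CustomTokenizer names from the test file in this PR.

# Sketch only: after this change, user-registered classes are found by scanning
# TOKENIZER_MAPPING._extra_content, ahead of the built-in per-model module lookup.
from transformers import AutoConfig, AutoTokenizer, PretrainedConfig, PreTrainedTokenizer
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name


class CustomConfig(PretrainedConfig):
    model_type = "custom"


class CustomTokenizer(PreTrainedTokenizer):
    pass


AutoConfig.register("custom", CustomConfig)
# Assumed V5 signature: register the single tokenizer class for this config.
AutoTokenizer.register(CustomConfig, tokenizer_class=CustomTokenizer)

# The registered class wins over any built-in class that shares the name.
assert tokenizer_class_from_name("CustomTokenizer") is CustomTokenizer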
@@ -858,10 +855,6 @@ def register(
         else:
             raise ValueError("You need to pass a `tokenizer_class`")

-        for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class):
-            if candidate is not None:
-                REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate
-
         if slow_tokenizer_class is not None and fast_tokenizer_class is not None:
             REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class

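The companion path that stays is the fast alias: when both a slow and a fast class are registered, REGISTERED_FAST_ALIASES maps the slow class name to the tokenizers-backed class, and tokenizer_class_from_name checks that mapping first. Again a sketch under the same assumptions, with the hypothetical CustomTokenizer / CustomTokenizerFast names used in the tests below.

# Sketch only: REGISTERED_FAST_ALIASES is consulted first in tokenizer_class_from_name,
# so a name registered as a slow class resolves to its fast counterpart.
from transformers import (
    AutoConfig,
    AutoTokenizer,
    PretrainedConfig,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
)
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name


class CustomConfig(PretrainedConfig):
    model_type = "custom"


class CustomTokenizer(PreTrainedTokenizer):
    pass


class CustomTokenizerFast(PreTrainedTokenizerFast):
    pass


AutoConfig.register("custom", CustomConfig)
AutoTokenizer.register(
    CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
)

# register() stored the alias, so the slow name now points at the fast class.
assert tokenizer_class_from_name("CustomTokenizer") is CustomTokenizerFast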
5 changes: 0 additions & 5 deletions tests/models/auto/test_tokenization_auto.py
@@ -45,7 +45,6 @@
 from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig
 from transformers.models.auto.tokenization_auto import (
     REGISTERED_FAST_ALIASES,
-    REGISTERED_TOKENIZER_CLASSES,
     TOKENIZER_MAPPING,
     TOKENIZER_MAPPING_NAMES,
     get_tokenizer_config,
@@ -364,7 +363,6 @@ def test_new_tokenizer_registration(self):
                 del CONFIG_MAPPING._extra_content["custom"]
             if CustomConfig in TOKENIZER_MAPPING._extra_content:
                 del TOKENIZER_MAPPING._extra_content[CustomConfig]
-            REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)

     @require_tokenizers
     def test_new_tokenizer_fast_registration(self):
@@ -409,8 +407,6 @@ def test_new_tokenizer_fast_registration(self):
                 del CONFIG_MAPPING._extra_content["custom"]
             if CustomConfig in TOKENIZER_MAPPING._extra_content:
                 del TOKENIZER_MAPPING._extra_content[CustomConfig]
-            REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None)
-            REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizerFast", None)
             REGISTERED_FAST_ALIASES.pop("CustomTokenizer", None)

     def test_from_pretrained_dynamic_tokenizer(self):
@@ -523,7 +519,6 @@ class NewTokenizer(BertTokenizer):
                 del CONFIG_MAPPING._extra_content["custom"]
             if CustomConfig in TOKENIZER_MAPPING._extra_content:
                 del TOKENIZER_MAPPING._extra_content[CustomConfig]
-            REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None)

     def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
         tokenizer = AutoTokenizer.from_pretrained(