From d202aca13bb4fae80e18fa19cf17f3b297d1d0a0 Mon Sep 17 00:00:00 2001
From: Mohd Faour
Date: Wed, 8 Apr 2026 16:26:37 +0300
Subject: [PATCH 1/2] Fix AttributeError in _patch_mistral_regex by removing
 .backend_tokenizer

---
 src/transformers/tokenization_utils_tokenizers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/tokenization_utils_tokenizers.py b/src/transformers/tokenization_utils_tokenizers.py
index b516a777ecf1..fcd82078295e 100644
--- a/src/transformers/tokenization_utils_tokenizers.py
+++ b/src/transformers/tokenization_utils_tokenizers.py
@@ -1360,11 +1360,11 @@ def is_base_mistral(model_id: str) -> bool:
             ),
             behavior="isolated",
         )
-        current_pretokenizer = tokenizer.backend_tokenizer.pre_tokenizer
+        current_pretokenizer = tokenizer.pre_tokenizer
         # Check if it's already a Sequence
         if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Sequence):
             # Replace the first element (the Split pattern)
-            tokenizer.backend_tokenizer.pre_tokenizer[0] = split_pretokenizer
+            tokenizer.pre_tokenizer[0] = split_pretokenizer
         else:
             # Replace Metaspace with ByteLevel when adding Split, as Metaspace(split=False) doesn't
             # work correctly with the Split pre-tokenizer and causes spaces to be lost during encoding
@@ -1374,7 +1374,7 @@ def is_base_mistral(model_id: str) -> bool:
             )
 
             # Not a Sequence, so create one with Split + current pretokenizer
-            tokenizer.backend_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
+            tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
                 [
                     split_pretokenizer,
                     current_pretokenizer,

From 93b05c044d61a181bc8a6a1b63fa18f01debed33 Mon Sep 17 00:00:00 2001
From: Mohd Faour
Date: Thu, 9 Apr 2026 15:27:38 +0300
Subject: [PATCH 2/2] Add regression test for fix_mistral_regex=True patching
 code path

The existing test only checks that passing fix_mistral_regex=True doesn't
error, but the hub model's config version causes an early return, so the
patching logic is never exercised. This new test creates a local config with
an old transformers_version to force the patching code path, verifying that
the pre_tokenizer is correctly patched to a Sequence without an
AttributeError.
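
For reference, a minimal sketch of the failure mode this guards against
(assuming the standalone tokenizers API; this snippet is illustrative and
not part of the patch):

    import tokenizers

    # A raw tokenizers.Tokenizer exposes .pre_tokenizer directly; only the
    # transformers fast-tokenizer wrapper has .backend_tokenizer, which is
    # why the old tokenizer.backend_tokenizer.pre_tokenizer access raised
    # AttributeError when handed the raw object.
    tok = tokenizers.Tokenizer(tokenizers.models.BPE())
    tok.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

    assert isinstance(tok.pre_tokenizer, tokenizers.pre_tokenizers.ByteLevel)
    assert not hasattr(tok, "backend_tokenizer")  # raw object lacks the wrapper attribute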
---
 tests/models/auto/test_tokenization_auto.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py
index 2bc79a3f82d6..d2514580e107 100644
--- a/tests/models/auto/test_tokenization_auto.py
+++ b/tests/models/auto/test_tokenization_auto.py
@@ -306,6 +306,27 @@ def test_auto_tokenizer_from_mistral_patching(self):
             "mistralai/Ministral-3-3B-Instruct-2512", fix_mistral_regex=True
         )  # should not error
 
+    @require_tokenizers
+    def test_auto_tokenizer_mistral_patching_applies_pretokenizer(self):
+        """Verify fix_mistral_regex=True actually patches the pre_tokenizer without AttributeError."""
+        import tokenizers
+
+        tokenizer = AutoTokenizer.from_pretrained("mistralai/Ministral-3-3B-Instruct-2512")
+        # Create a temp config with an old transformers_version so the patching code path is exercised
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config_path = os.path.join(tmp_dir, "config.json")
+            with open(config_path, "w", encoding="utf-8") as f:
+                json.dump({"model_type": "mistral", "transformers_version": "4.50.0"}, f)
+
+            patched = TokenizersBackend._patch_mistral_regex(
+                tokenizer._tokenizer,
+                tmp_dir,
+                is_local=True,
+                fix_mistral_regex=True,
+            )
+            self.assertTrue(getattr(patched, "fix_mistral_regex", False))
+            self.assertIsInstance(patched.pre_tokenizer, tokenizers.pre_tokenizers.Sequence)
+
     @require_tokenizers
     def test_auto_tokenizer_loads_bloom_repo_without_tokenizer_class(self):
         tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
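
Reviewer note: a rough sketch of the pre_tokenizer shape the patch is expected
to produce (the Split pattern below is a placeholder, not Mistral's real
regex, and the names are illustrative only):

    import tokenizers

    # After patching, pre_tokenizer should be a Sequence whose first element
    # is the Split pre-tokenizer, followed by the original (or ByteLevel)
    # pre-tokenizer.
    split_pretokenizer = tokenizers.pre_tokenizers.Split(
        pattern=tokenizers.Regex(r"\d{1,3}"),  # placeholder pattern
        behavior="isolated",
    )
    tok = tokenizers.Tokenizer(tokenizers.models.BPE())
    tok.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
        [split_pretokenizer, tokenizers.pre_tokenizers.ByteLevel()]
    )
    assert isinstance(tok.pre_tokenizer, tokenizers.pre_tokenizers.Sequence)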