From d202aca13bb4fae80e18fa19cf17f3b297d1d0a0 Mon Sep 17 00:00:00 2001
From: Mohd Faour
Date: Wed, 8 Apr 2026 16:26:37 +0300
Subject: [PATCH 1/2] Fix AttributeError in _patch_mistral_regex by removing
 .backend_tokenizer

---
 src/transformers/tokenization_utils_tokenizers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/tokenization_utils_tokenizers.py b/src/transformers/tokenization_utils_tokenizers.py
index b516a777ecf1..fcd82078295e 100644
--- a/src/transformers/tokenization_utils_tokenizers.py
+++ b/src/transformers/tokenization_utils_tokenizers.py
@@ -1360,11 +1360,11 @@ def is_base_mistral(model_id: str) -> bool:
             ),
             behavior="isolated",
         )
-        current_pretokenizer = tokenizer.backend_tokenizer.pre_tokenizer
+        current_pretokenizer = tokenizer.pre_tokenizer
         # Check if it's already a Sequence
         if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Sequence):
             # Replace the first element (the Split pattern)
-            tokenizer.backend_tokenizer.pre_tokenizer[0] = split_pretokenizer
+            tokenizer.pre_tokenizer[0] = split_pretokenizer
         else:
             # Replace Metaspace with ByteLevel when adding Split, as Metaspace(split=False) doesn't
             # work correctly with the Split pre-tokenizer and causes spaces to be lost during encoding
@@ -1374,7 +1374,7 @@ def is_base_mistral(model_id: str) -> bool:
             )
 
             # Not a Sequence, so create one with Split + current pretokenizer
-            tokenizer.backend_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
+            tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
                 [
                     split_pretokenizer,
                     current_pretokenizer,

From 93b05c044d61a181bc8a6a1b63fa18f01debed33 Mon Sep 17 00:00:00 2001
From: Mohd Faour
Date: Thu, 9 Apr 2026 15:27:38 +0300
Subject: [PATCH 2/2] Add regression test for fix_mistral_regex=True patching
 code path

The existing test only checks that passing fix_mistral_regex=True doesn't
error, but the hub model's config version causes an early return, so the
patching logic is never exercised. This new test creates a local config with
an old transformers_version to force the patching code path, verifying that
the pre_tokenizer is correctly patched to a Sequence without an
AttributeError.
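
For reference, a minimal sketch of the failure mode this guards against
(assuming the standalone tokenizers API; this snippet is illustrative and
not part of the patch):

    import tokenizers

    # A raw tokenizers.Tokenizer exposes .pre_tokenizer directly; only the
    # transformers fast-tokenizer wrapper has .backend_tokenizer, which is
    # why the old tokenizer.backend_tokenizer.pre_tokenizer access raised
    # AttributeError when handed the raw object.
    tok = tokenizers.Tokenizer(tokenizers.models.BPE())
    tok.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel()

    assert isinstance(tok.pre_tokenizer, tokenizers.pre_tokenizers.ByteLevel)
    assert not hasattr(tok, "backend_tokenizer")  # raw object lacks the wrapper attribute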
---
 tests/models/auto/test_tokenization_auto.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py
index 2bc79a3f82d6..d2514580e107 100644
--- a/tests/models/auto/test_tokenization_auto.py
+++ b/tests/models/auto/test_tokenization_auto.py
@@ -306,6 +306,27 @@ def test_auto_tokenizer_from_mistral_patching(self):
             "mistralai/Ministral-3-3B-Instruct-2512", fix_mistral_regex=True
         )  # should not error
 
+    @require_tokenizers
+    def test_auto_tokenizer_mistral_patching_applies_pretokenizer(self):
+        """Verify fix_mistral_regex=True actually patches the pre_tokenizer without AttributeError."""
+        import tokenizers
+
+        tokenizer = AutoTokenizer.from_pretrained("mistralai/Ministral-3-3B-Instruct-2512")
+        # Create a temp config with an old transformers_version so the patching code path is exercised
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config_path = os.path.join(tmp_dir, "config.json")
+            with open(config_path, "w", encoding="utf-8") as f:
+                json.dump({"model_type": "mistral", "transformers_version": "4.50.0"}, f)
+
+            patched = TokenizersBackend._patch_mistral_regex(
+                tokenizer._tokenizer,
+                tmp_dir,
+                is_local=True,
+                fix_mistral_regex=True,
+            )
+            self.assertTrue(getattr(patched, "fix_mistral_regex", False))
+            self.assertIsInstance(patched.pre_tokenizer, tokenizers.pre_tokenizers.Sequence)
+
     @require_tokenizers
     def test_auto_tokenizer_loads_bloom_repo_without_tokenizer_class(self):
         tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
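
Reviewer note: a rough sketch of the pre_tokenizer shape the patch is expected
to produce (the Split pattern below is a placeholder, not Mistral's real
regex, and the names are illustrative only):

    import tokenizers

    # After patching, pre_tokenizer should be a Sequence whose first element
    # is the Split pre-tokenizer, followed by the original (or ByteLevel)
    # pre-tokenizer.
    split_pretokenizer = tokenizers.pre_tokenizers.Split(
        pattern=tokenizers.Regex(r"\d{1,3}"),  # placeholder pattern
        behavior="isolated",
    )
    tok = tokenizers.Tokenizer(tokenizers.models.BPE())
    tok.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
        [split_pretokenizer, tokenizers.pre_tokenizers.ByteLevel()]
    )
    assert isinstance(tok.pre_tokenizer, tokenizers.pre_tokenizers.Sequence)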