From 78c2a750d3e91ac6d57a2d032d870bc830c55fc8 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 7 Sep 2023 22:56:55 +0000 Subject: [PATCH 1/5] fix `set_infilling_processor` to properly reset --- .../models/code_llama/tokenization_code_llama_fast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index d3fc6e8abb99..406ae2cef92c 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -264,6 +264,7 @@ def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens= ] ) self.update_post_processor() + return self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁") pair = [self.bos_token] if self.add_bos_token and add_special_tokens else [] From 6ad456769aa0e90043f62b99d3075946aec6a812 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 8 Sep 2023 14:07:20 +0000 Subject: [PATCH 2/5] Add docstring! --- .../models/code_llama/tokenization_code_llama_fast.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index 406ae2cef92c..6e32783dec8d 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -256,6 +256,15 @@ def add_bos_token(self, value): self.update_post_processor() def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True): + """ + Updates the normalizer to make sure the prompt format for `infilling` is respected. + If `reset` is set to `True`, the normalizer is reset to the normal behaviour. + The infilling format is the following: + if suffix_first + "
 {suf}  {pre}"
+        else:
+            " 
 {pre} {suf} "
+        """
         if reset:
             self._tokenizer.normalizer = normalizers.Sequence(
                 [

From 76e0e351fc92480880f96eae497a2e407c862c4e Mon Sep 17 00:00:00 2001
From: Arthur Zucker 
Date: Fri, 8 Sep 2023 14:20:25 +0000
Subject: [PATCH 3/5] fixups

---
 .../models/code_llama/tokenization_code_llama_fast.py       | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index 6e32783dec8d..5fc34b615559 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -257,10 +257,8 @@ def add_bos_token(self, value):
 
     def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
         """
-        Updates the normalizer to make sure the prompt format for `infilling` is respected.
-        If `reset` is set to `True`, the normalizer is reset to the normal behaviour.
-        The infilling format is the following:
-        if suffix_first
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. If `reset` is set to
+        `True`, the normalizer is reset to the normal behaviour. The infilling format is the following: if suffix_first
             " 
 {suf}  {pre}"
         else:
             " 
 {pre} {suf} "

From cceb8c3e1822fe44151ec852a2c9ede2f54d19ae Mon Sep 17 00:00:00 2001
From: Arthur Zucker 
Date: Fri, 8 Sep 2023 18:30:26 +0000
Subject: [PATCH 4/5] more details in the documentation about the tokenization

---
 .../models/code_llama/tokenization_code_llama_fast.py      | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index 5fc34b615559..9bcf18f760a8 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -257,11 +257,14 @@ def add_bos_token(self, value):
 
     def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
         """
-        Updates the normalizer to make sure the prompt format for `infilling` is respected. If `reset` is set to
-        `True`, the normalizer is reset to the normal behaviour. The infilling format is the following: if suffix_first
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the following: if suffix_first
             " 
 {suf}  {pre}"
         else:
             " 
 {pre} {suf} "
+
+        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to
+        their "normal" behaviour, which is to add a prefix space for the normalizer,
+        and add a `bos_token` to the input text for the `post_processor`.
         """
         if reset:
             self._tokenizer.normalizer = normalizers.Sequence(

From 5c2f404409e2a7a12662f437dc8fc78d2bba521c Mon Sep 17 00:00:00 2001
From: Arthur Zucker 
Date: Fri, 8 Sep 2023 18:30:30 +0000
Subject: [PATCH 5/5] style

---
 .../models/code_llama/tokenization_code_llama_fast.py     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index 9bcf18f760a8..768946ea35f7 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -257,14 +257,14 @@ def add_bos_token(self, value):
 
     def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
         """
-        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the following: if suffix_first
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
+        following: if suffix_first
             " 
 {suf}  {pre}"
         else:
             " 
 {pre} {suf} "
 
-        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to
-        their "normal" behaviour, which is to add a prefix space for the normalizer,
-        and add a `bos_token` to the input text for the `post_processor`.
+        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
+        is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
         """
         if reset:
             self._tokenizer.normalizer = normalizers.Sequence(