From 31aad90baa89b4ac85e2d9bd1f1a89b4a7e6704d Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Wed, 31 Aug 2022 04:41:27 -0400 Subject: [PATCH 01/29] add: 2 variants of multi query implementation; printing some details --- src/transformers/models/gpt2/modeling_gpt2.py | 183 ++++++++++++++++-- 1 file changed, 172 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 1c61adb10d..911874d764 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -53,6 +53,12 @@ from ...utils.model_parallel_utils import assert_device_map, get_device_map from .configuration_gpt2 import GPT2Config +from enum import Enum +class AttentionType(Enum): + MULTI_HEAD = 1 + MULTI_QUERY = 2 + MULTI_QUERY_1 = 3 + logger = logging.get_logger(__name__) @@ -139,11 +145,28 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): ), ) self.register_buffer("masked_bias", torch.tensor(-1e4)) + + if hasattr(config, 'attention_type'): + self.attention_type = config.attention_type + else: + self.attention_type = AttentionType.MULTI_HEAD + assert ( + self.attention_type == AttentionType.MULTI_HEAD or + self.attention_type == AttentionType.MULTI_QUERY or + self.attention_type == AttentionType.MULTI_QUERY_1 + ) + + if hasattr(config, 'print_details') and config.print_details is True: + self.print_details = layer_idx == 0 + else: + self.print_details = False + self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" @@ -157,18 +180,37 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx self.layer_idx = layer_idx self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + if self.reorder_and_upcast_attn and self.attention_type != AttentionType.MULTI_HEAD: + raise NotImplementedError(f'attention_type {self.attention_type} for reorder_and_upcast_attn') if self.is_cross_attention: + if self.attention_type != AttentionType.MULTI_HEAD: + raise NotImplementedError(f'attention_type {self.attention_type} for cross_attention') + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) self.q_attn = Conv1D(self.embed_dim, self.embed_dim) else: - self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + if self.attention_type != AttentionType.MULTI_HEAD: + self.c_attn = Conv1D((self.num_heads + 2) * self.head_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) self.attn_dropout = nn.Dropout(config.attn_pdrop) self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() + + if self.print_details: + print('Attention info________________________________________________') + print('max_positions ', max_positions) + print('self.embed_dim ', self.embed_dim) + print('self.num_heads ', self.num_heads) + print('self.head_dim ', self.head_dim) + print('self.split_size ', self.split_size) + print('self.c_attn', self.c_attn) + print('______________________________________________________________') def prune_heads(self, heads): if len(heads) == 0: @@ -186,7 +228,22 @@ def prune_heads(self, heads): self.pruned_heads = 
self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) + if self.attention_type == AttentionType.MULTI_QUERY_1: + batch_size = query.shape[0] + query_length = query.shape[1] // self.num_heads + key_length = key.shape[-1] + attn_weights = torch.bmm(query, key) + if self.print_details: + print('query: ', query.shape) + print('key: ', key.shape) + print('attn_weights: ', attn_weights.shape) + attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) + else: + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + if self.print_details: + print('query: ', query.shape) + print('key.transpose(-1, -2): ', key.transpose(-1, -2).shape) + print('attn_weights: ', attn_weights.shape) if self.scale_attn_weights: attn_weights = attn_weights / torch.tensor( @@ -199,7 +256,13 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if not self.is_cross_attention: # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) + if self.attention_type != AttentionType.MULTI_QUERY_1: + query_length, key_length = query.size(-2), key.size(-2) + if self.print_details: + print('query', query.shape) + print('key', key.shape) + print('query_length', query_length) + print('key_length', key_length) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. @@ -221,7 +284,12 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if head_mask is not None: attn_weights = attn_weights * head_mask - attn_output = torch.matmul(attn_weights, value) + if self.attention_type == AttentionType.MULTI_QUERY_1: + attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) + attn_output = torch.bmm(attn_weights, value) + attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + else: + attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights @@ -309,7 +377,13 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + + if self.print_details: + print('Attention_______________________________________________') if encoder_hidden_states is not None: + if self.attention_type != AttentionType.MULTI_HEAD: + raise NotImplementedError(f'attention_type {self.attention_type} for encoder_hidden_states') + if not hasattr(self, "q_attn"): raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. 
" @@ -320,15 +394,39 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) + if self.attention_type != AttentionType.MULTI_HEAD: + query, key, value = self.c_attn(hidden_states).split( + (self.num_heads*self.head_dim, self.head_dim, self.head_dim), + dim=2 + ) + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + + if self.attention_type == AttentionType.MULTI_QUERY_1: + batch_size, seq_length = hidden_states.shape[:2] + query = query.view( + batch_size, seq_length, self.num_heads, self.head_dim, + ).reshape( + batch_size, seq_length * self.num_heads, self.head_dim + ) + key = key.permute(0, 2, 1) # [batch_size, head_dim, seq_length] + # value [batch_size, seq_length, head_dim] + elif self.attention_type == AttentionType.MULTI_QUERY: + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, 1, self.head_dim) + value = self._split_heads(value, 1, self.head_dim) + else: + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past - key = torch.cat((past_key, key), dim=-2) + if self.attention_type == AttentionType.MULTI_QUERY_1: + key = torch.cat((past_key, key), dim=-1) + else: + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) if use_cache is True: @@ -385,6 +483,11 @@ def __init__(self, config, layer_idx=None): self.mlp = GPT2MLP(inner_dim, config) + if hasattr(config, 'print_details') and config.print_details is True: + self.print_details = layer_idx == 0 + else: + self.print_details = False + def forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], @@ -398,6 +501,9 @@ def forward( ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: residual = hidden_states hidden_states = self.ln_1(hidden_states) + if self.print_details: + print('hidden_states, ba', hidden_states.size()) + print('attention_mask, ba', attention_mask.size()) attn_outputs = self.attn( hidden_states, layer_past=layer_past, @@ -696,6 +802,11 @@ def __init__(self, config): self.device_map = None self.gradient_checkpointing = False + if hasattr(config, 'print_details') and config.print_details is True: + self.print_details = True + else: + self.print_details = False + # Initialize weights and apply final processing self.post_init() @@ -775,6 +886,8 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if self.print_details: + print('-------------startd forward-----------------------------------------') if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -793,6 +906,9 @@ def forward( token_type_ids = token_type_ids.view(-1, input_shape[-1]) if position_ids is not None: position_ids = position_ids.view(-1, input_shape[-1]) + if self.print_details: + print('token_type_ids ', 
token_type_ids is not None) + print('position_ids ', position_ids is not None) if past_key_values is None: past_length = 0 @@ -802,18 +918,31 @@ def forward( if position_ids is None: position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) + if self.print_details: + print('past_length ', past_length) + print('position_ids', position_ids.size(), position_ids) + print('past_key_values', len(past_key_values)) # GPT2Attention mask. + if self.print_details: + print('attention_mask ', attention_mask is not None) if attention_mask is not None: if batch_size <= 0: raise ValueError("batch_size has to be defined and > 0") + if self.print_details: + print('attention_mask ', attention_mask.size()) attention_mask = attention_mask.view(batch_size, -1) + if self.print_details: + print('attention_mask ', attention_mask.size()) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. attention_mask = attention_mask[:, None, None, :] + if self.print_details: + print('attention_mask ', attention_mask.size()) + print(attention_mask) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for @@ -822,7 +951,12 @@ def forward( # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + if self.print_details: + print(attention_mask) + print(torch.finfo(self.dtype).min) + if self.print_details: + print('encoder_hidden_states ', encoder_hidden_states is not None) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.add_cross_attention and encoder_hidden_states is not None: @@ -839,11 +973,17 @@ def forward( # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) + if self.print_details: + print('head_mask', len(head_mask)) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds + if self.print_details: + print('inputs_embeds', inputs_embeds.size()) + print('position_embeds', position_embeds.size()) + print('hidden_states', hidden_states.size()) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) @@ -852,7 +992,14 @@ def forward( hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) - + if self.print_details: + print('output_shape ', output_shape) + print('input_shape ', input_shape) + print('hidden_states.size ', hidden_states.size()) + + print('use_cache', use_cache) + print('output_attentions', output_attentions) + print('output_hidden_states', output_attentions) presents = () if use_cache else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -898,6 +1045,16 @@ def custom_forward(*inputs): encoder_attention_mask, ) 
else: + if self.print_details and i == 0: + print('Block .......................................................................') + print('hidden_states', hidden_states.size()) + print('attention_mask', attention_mask.size()) + print('layer_past', False if layer_past is None else [st.shape for st in layer_past]) + print('head_mask[i]', False if head_mask[i] is None else head_mask[i].size()) + print('encoder_hidden_states', False if encoder_hidden_states is None else encoder_hidden_states.size()) + print('encoder_attention_mask', False if encoder_attention_mask is None else encoder_attention_mask.size()) + print('use_cache', use_cache) + print('output_attentions', output_attentions) outputs = block( hidden_states, layer_past=layer_past, @@ -908,6 +1065,8 @@ def custom_forward(*inputs): use_cache=use_cache, output_attentions=output_attentions, ) + if self.print_details and i == 0: + print(len(outputs)) hidden_states = outputs[0] if use_cache is True: @@ -937,6 +1096,8 @@ def custom_forward(*inputs): for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] if v is not None ) + if self.print_details: + print('-------------finish forward-----------------------------------------') return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, From 9c13b66591846c50eb0dfff9deb6b439e2b87868 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 31 Oct 2022 21:47:08 -0400 Subject: [PATCH 02/29] Unpin PyTorch --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e4465a6e1f..9266a1de7f 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ "timeout-decorator", "timm", "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch>=1.7,!=1.12.0,<1.13.0", + "torch>=1.7,!=1.12.0", "torchaudio", "pyctcdecode>=0.4.0", "tqdm>=4.27", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 22a403a49f..1d6223f2a7 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -69,7 +69,7 @@ "timeout-decorator": "timeout-decorator", "timm": "timm", "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch": "torch>=1.7,!=1.12.0,<1.13.0", + "torch": "torch>=1.7,!=1.12.0", "torchaudio": "torchaudio", "pyctcdecode": "pyctcdecode>=0.4.0", "tqdm": "tqdm>=4.27", From 1ebb3f7e59f996a88a5299027b2b1ed6d000b72f Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Mon, 31 Oct 2022 21:47:49 -0400 Subject: [PATCH 03/29] Release v4.24.0 --- README.md | 6 +++--- README_es.md | 6 +++--- README_ko.md | 6 +++--- README_zh-hans.md | 6 +++--- README_zh-hant.md | 6 +++--- examples/flax/question-answering/run_qa.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../image-classification/run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- 
examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- .../pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 46 files changed, 56 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 37807136b7..e7e9c17f42 100644 --- a/README.md +++ b/README.md @@ -302,7 +302,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. -1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/main/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. @@ -324,7 +324,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. 
**[LiLT](https://huggingface.co/docs/transformers/main/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -376,7 +376,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_es.md b/README_es.md index a5aeaee72c..5eec3153b1 100644 --- a/README_es.md +++ b/README_es.md @@ -302,7 +302,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. -1. **[FLAN-T5](https://huggingface.co/docs/transformers/main/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. @@ -324,7 +324,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. **[LiLT](https://huggingface.co/docs/transformers/main/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. 
**[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -376,7 +376,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_ko.md b/README_ko.md index fe7c0f3442..e0a3c0a232 100644 --- a/README_ko.md +++ b/README_ko.md @@ -252,7 +252,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. -1. **[FLAN-T5](https://huggingface.co/docs/transformers/main/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. 
**[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. @@ -274,7 +274,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. **[LiLT](https://huggingface.co/docs/transformers/main/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -326,7 +326,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. 
**[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hans.md b/README_zh-hans.md index d02490c9d0..4119b7876c 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -276,7 +276,7 @@ conda install -c huggingface transformers 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (来自 Baidu) 伴随论文 [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu 发布。 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. 
**ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. -1. **[FLAN-T5](https://huggingface.co/docs/transformers/main/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (来自 Facebook AI) 伴随论文 [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) 由 Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela 发布。 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 @@ -298,7 +298,7 @@ conda install -c huggingface transformers 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. 
**[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (来自 Meta AI) 伴随论文 [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) 由 Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze 发布。 -1. **[LiLT](https://huggingface.co/docs/transformers/main/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。 +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) released 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。 @@ -350,7 +350,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (来自 Microsoft Research) 伴随论文 [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) 由 Brandon Smock, Rohith Pesala, Robin Abraham 发布。 1. 
**[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (来自 Microsoft Research) 伴随论文 [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) 由 Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou 发布。 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). diff --git a/README_zh-hant.md b/README_zh-hant.md index f506e41687..97ee14685d 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -288,7 +288,7 @@ conda install -c huggingface transformers 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. -1. **[FLAN-T5](https://huggingface.co/docs/transformers/main/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei +1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. @@ -310,7 +310,7 @@ conda install -c huggingface transformers 1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. 1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze. -1. **[LiLT](https://huggingface.co/docs/transformers/main/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. +1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding. 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. 1. 
**[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang. 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. @@ -362,7 +362,7 @@ conda install -c huggingface transformers 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[Table Transformer](https://huggingface.co/docs/transformers/main/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. +1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham. 1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. 1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. 1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace). 
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 27c52a1ebc..fe113bf58e 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 4b9a8554d2..8ee0706d0b 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 77b734aba1..05ca03cd58 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 9b2f3931b4..732a77e4c5 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 9f3f603b70..47db60e998 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 13f137450d..c91787217a 100644 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 664a96c3c2..841fb527fb 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f9419fa9b1..5c7f07d62c 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 6181920a1e..2b8d497d0b 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index da8724fded..b28512ae04 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index d22fa4a49a..f29a0f1ab4 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index e98d5c3cc8..0ac1f74dc2 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 525bdf3689..8a88192c88 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index fba8746bbc..119c21d635 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index a9e63335df..1651f917d4 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index a03accc68d..08b50bab43 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index d7348f37c0..7db283612a 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 61001f1120..9f271d7d23 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 064d3ec1fd..59909f25b4 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 9e4f762aea..deb6d0805f 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 0bf551d17a..1e02b556f0 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -45,7 +45,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 21fdc9e76e..404716aeed 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index ffbcd97aa0..5bbf3f7d2c 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index cfc530354c..ff9df06114 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 4a08b92405..8d9ed8d54a 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index c0da4a59cb..5ee12861ec 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 155f41d509..3bd925569b 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 6e343d1af9..59640d32b7 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 5f54560d24..013af4f0e9 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 5878419c1e..c3545a3cec 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index edbc3e3d27..d5958c70e6 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 6a273428cc..a127fe2cdb 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index a2c446b588..a9584ed10a 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index db4e4af3ff..bfaf388406 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index fa5f13b726..a3e608742e 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index ebcdb71160..3fbed7aefa 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index a400eb4b63..ffc9039b78 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a54f858b8c..38dd0b2d32 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 627cb856de..0ada9e0b1e 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.24.0.dev0") +check_min_version("4.24.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 9266a1de7f..3a9bce4ff8 100644 --- a/setup.py +++ b/setup.py @@ -409,7 +409,7 @@ def run(self): setup( name="transformers", - version="4.24.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.24.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e4d8e66de2..3f3e79c9fc 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -22,7 +22,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). 
-__version__ = "4.24.0.dev0" +__version__ = "4.24.0" from typing import TYPE_CHECKING From 0e654e04dc88525701b3c3cf2e3655428a7529ac Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 1 Nov 2022 17:20:42 +0530 Subject: [PATCH 04/29] Added onnx config whisper (#19525) * Added onnx config whisper * added whisper support onnx * add audio input data * added whisper support onnx * fixed the seqlength value * Updated the whisper onnx ocnfig * restore files to old version * removed attention mask from inputs * Updated get_dummy_input_onnxruntime docstring * Updated relative imports and token generation * update docstring --- docs/source/en/serialization.mdx | 1 + src/transformers/models/whisper/__init__.py | 4 +- .../models/whisper/configuration_whisper.py | 65 +++++++++++++++++++ src/transformers/onnx/config.py | 51 ++++++++++++++- src/transformers/onnx/convert.py | 21 +++++- src/transformers/onnx/features.py | 9 +++ tests/onnx/test_onnx_v2.py | 3 +- 7 files changed, 148 insertions(+), 6 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index db4f8aa83d..1cbc1237f2 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -100,6 +100,7 @@ Ready-made configurations include the following architectures: - Table Transformer - Vision Encoder decoder - ViT +- Whisper - XLM - XLM-RoBERTa - XLM-RoBERTa-XL diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py index 71e354a936..2528e03a4d 100644 --- a/src/transformers/models/whisper/__init__.py +++ b/src/transformers/models/whisper/__init__.py @@ -21,7 +21,7 @@ _import_structure = { - "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig"], + "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig", "WhisperOnnxConfig"], "feature_extraction_whisper": ["WhisperFeatureExtractor"], "processing_whisper": ["WhisperProcessor"], "tokenization_whisper": ["WhisperTokenizer"], @@ -55,7 +55,7 @@ ] if TYPE_CHECKING: - from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig + from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig, WhisperOnnxConfig from .feature_extraction_whisper import WhisperFeatureExtractor from .processing_whisper import WhisperProcessor from .tokenization_whisper import WhisperTokenizer diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index 6ee5ee9057..c25dab667d 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -14,10 +14,19 @@ # limitations under the License. 
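The commit above wires Whisper into the ONNX exporter: a new `speech2seq-lm` feature, synthetic audio dummy inputs, and the `WhisperOnnxConfig` defined in the configuration file that follows. A minimal usage sketch of that support, assuming the existing `transformers.onnx` Python API; the processor-based `export(...)` call and the opset choice are illustrative assumptions, and only the checkpoint name is taken from the tests added later in this patch:

```python
from pathlib import Path

from transformers import AutoProcessor, TensorType, WhisperForConditionalGeneration
from transformers.models.whisper import WhisperOnnxConfig
from transformers.onnx import export

ckpt = "openai/whisper-tiny.en"
processor = AutoProcessor.from_pretrained(ckpt)  # bundles the feature extractor and tokenizer
model = WhisperForConditionalGeneration.from_pretrained(ckpt)

onnx_config = WhisperOnnxConfig(model.config, task="default")

# No real audio is needed: the config synthesizes a sine wave and runs it through the
# feature extractor to build `input_features` (see `_generate_dummy_audio` below).
dummy = onnx_config.generate_dummy_inputs(processor, framework=TensorType.PYTORCH)
print({name: tuple(tensor.shape) for name, tensor in dummy.items()})

# Assumed final step, mirroring the documented export(preprocessor, model, config, opset, output) API.
export(processor, model, onnx_config, onnx_config.default_onnx_opset, Path("whisper.onnx"))
```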
""" Whisper model configuration""" +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union + from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast from ...utils import logging +if TYPE_CHECKING: + from ...feature_extraction_utils import FeatureExtractionMixin + from ...tokenization_utils_base import PreTrainedTokenizerBase + from ...utils import TensorType + logger = logging.get_logger(__name__) WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = { @@ -214,3 +223,59 @@ def __init__( begin_suppress_tokens=begin_suppress_tokens, **kwargs, ) + + +class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict( + [ + ("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}), + ] + ) + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + + return common_inputs + + def generate_dummy_inputs( + self, + preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"], + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + sampling_rate: int = 22050, + time_duration: float = 5.0, + frequency: int = 220, + ) -> Mapping[str, Any]: + dummy_inputs = OrderedDict() + encoder_inputs = OnnxConfig.generate_dummy_inputs( + self, + preprocessor=preprocessor.feature_extractor, + batch_size=batch_size, + framework=framework, + sampling_rate=sampling_rate, + time_duration=time_duration, + frequency=frequency, + ) + decoder_inputs = super().generate_dummy_inputs( + preprocessor.tokenizer, batch_size, seq_length, is_pair, framework + ) + + dummy_inputs["input_features"] = encoder_inputs.pop("input_features") + dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids") + + if "past_key_values" in decoder_inputs: + dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values") + + return dummy_inputs + + @property + def atol_for_validation(self) -> float: + return 1e-3 diff --git a/src/transformers/onnx/config.py b/src/transformers/onnx/config.py index 5a1c3e6eed..1c8d10939a 100644 --- a/src/transformers/onnx/config.py +++ b/src/transformers/onnx/config.py @@ -104,6 +104,7 @@ class OnnxConfig(ABC): "sequence-classification": OrderedDict({"logits": {0: "batch"}}), "token-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}), "vision2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}), + "speech2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}), } def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: List[PatchingSpec] = None): @@ -262,6 +263,19 @@ def _generate_dummy_images( images.append(Image.fromarray(data.astype("uint8")).convert("RGB")) return images + def _generate_dummy_audio( + self, batch_size: int = 2, sampling_rate: int = 22050, time_duration: float = 5.0, frequency: int = 220 + ): + audio_data = [] + for _ in range(batch_size): + # time variable + t = np.linspace(0, time_duration, int(time_duration * sampling_rate), endpoint=False) + + # generate pure sine wave at `frequency` Hz + audio_data.append(0.5 * np.sin(2 * np.pi * frequency * t)) + + return audio_data + def generate_dummy_inputs( self, preprocessor: Union["PreTrainedTokenizerBase", 
"FeatureExtractionMixin"], @@ -273,6 +287,9 @@ def generate_dummy_inputs( num_channels: int = 3, image_width: int = 40, image_height: int = 40, + sampling_rate: int = 22050, + time_duration: float = 5.0, + frequency: int = 220, tokenizer: "PreTrainedTokenizerBase" = None, ) -> Mapping[str, Any]: """ @@ -297,6 +314,12 @@ def generate_dummy_inputs( The width of the generated images. image_height (`int`, *optional*, defaults to 40): The height of the generated images. + sampling_rate (`int`, *optional* defaults to 22050) + The sampling rate for audio data generation. + time_duration (`float`, *optional* defaults to 5.0) + Total seconds of sampling for audio data generation. + frequency (`int`, *optional* defaults to 220) + The desired natural frequency of generated audio. Returns: Mapping[str, Tensor] holding the kwargs to provide to the model's forward function @@ -325,7 +348,12 @@ def generate_dummy_inputs( seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add ) # Generate dummy inputs according to compute batch and sequence - dummy_input = [" ".join([preprocessor.unk_token]) * seq_length] * batch_size + input_token = ( + preprocessor.unk_token + if (preprocessor.unk_token is not None and len(preprocessor.unk_token) > 0) + else "0" + ) + dummy_input = [" ".join([input_token]) * seq_length] * batch_size if self.task == "multiple-choice": # If dynamic axis (-1) we forward with a fixed dimension of 4 candidate answers to avoid optimizations # made by ONNX @@ -345,11 +373,32 @@ def generate_dummy_inputs( batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch) dummy_input = self._generate_dummy_images(batch_size, num_channels, image_height, image_width) return dict(preprocessor(images=dummy_input, return_tensors=framework)) + elif ( + isinstance(preprocessor, FeatureExtractionMixin) and preprocessor.model_input_names[0] == "input_features" + ): + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch) + dummy_input = self._generate_dummy_audio(batch_size, sampling_rate, time_duration, frequency) + return dict(preprocessor(dummy_input, return_tensors=framework)) else: raise ValueError( "Unable to generate dummy inputs for the model. Please provide a tokenizer or a preprocessor." ) + def generate_dummy_inputs_onnxruntime(self, reference_model_inputs: Mapping[str, Any]) -> Mapping[str, Any]: + """ + Generate inputs for ONNX Runtime using the reference model inputs. Override this to run inference with seq2seq + models which have the encoder and decoder exported as separate ONNX files. + + Args: + reference_model_inputs ([`Mapping[str, Tensor]`): + Reference inputs for the model. 
+ + Returns: + `Mapping[str, Tensor]`: The mapping holding the kwargs to provide to the model's forward function + """ + return reference_model_inputs + def patch_ops(self): for spec in self._patching_specs: custom_op = spec.custom_op if spec.op_wrapper is None else spec.op_wrapper(spec.custom_op) diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index 234724699e..e953207b3a 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -145,7 +145,21 @@ def export_pytorch( device = torch.device(device) if device.type == "cuda" and torch.cuda.is_available(): model.to(device) - model_inputs = dict((k, v.to(device)) for k, v in model_inputs.items()) + model_inputs_device = dict() + for k, v in model_inputs.items(): + if isinstance(v, Tuple): + model_inputs_device[k] = tuple( + x.to(device) if isinstance(x, torch.Tensor) else None for x in v + ) + elif isinstance(v, List): + model_inputs_device[k] = [ + tuple(x.to(device) if isinstance(x, torch.Tensor) else None for x in t) for t in v + ] + else: + model_inputs_device[k] = v.to(device) + + model_inputs = model_inputs_device + inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys()) onnx_outputs = list(config.outputs.keys()) @@ -404,9 +418,12 @@ def validate_model_outputs( else: ref_outputs_dict[name] = value + # Create onnxruntime inputs from the reference model inputs + reference_model_inputs_onnxruntime = config.generate_dummy_inputs_onnxruntime(reference_model_inputs) + # We flatten potential collection of inputs (i.e. past_keys) onnx_inputs = {} - for name, value in reference_model_inputs.items(): + for name, value in reference_model_inputs_onnxruntime.items(): if isinstance(value, (list, tuple)): value = config.flatten_output_collection_property(name, value) onnx_inputs.update({tensor_name: pt_tensor.numpy() for tensor_name, pt_tensor in value.items()}) diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index 878fcce651..8e69c5a1a0 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -29,6 +29,7 @@ AutoModelForSemanticSegmentation, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, + AutoModelForSpeechSeq2Seq, AutoModelForTokenClassification, AutoModelForVision2Seq, ) @@ -100,6 +101,7 @@ class FeaturesManager: "masked-im": AutoModelForMaskedImageModeling, "semantic-segmentation": AutoModelForSemanticSegmentation, "vision2seq-lm": AutoModelForVision2Seq, + "speech2seq-lm": AutoModelForSpeechSeq2Seq, } if is_tf_available(): _TASKS_TO_TF_AUTOMODELS = { @@ -492,6 +494,13 @@ class FeaturesManager: "vit": supported_features_mapping( "default", "image-classification", "masked-im", onnx_config_cls="models.vit.ViTOnnxConfig" ), + "whisper": supported_features_mapping( + "default", + "default-with-past", + "speech2seq-lm", + "speech2seq-lm-with-past", + onnx_config_cls="models.whisper.WhisperOnnxConfig", + ), "xlm": supported_features_mapping( "default", "masked-lm", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index eac6ee0634..ab8610db71 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -218,6 +218,7 @@ def test_values_override(self): ("yolos", "hustvl/yolos-tiny"), ("segformer", "nvidia/segformer-b0-finetuned-ade-512-512"), ("swin", "microsoft/swin-tiny-patch4-window7-224"), + ("whisper", "openai/whisper-tiny.en"), } PYTORCH_EXPORT_ENCODER_DECODER_MODELS = { @@ -398,7 +399,7 @@ def _onnx_export_encoder_decoder_models( 
preprocessor = AutoTokenizer.from_pretrained(model_name) with NamedTemporaryFile("w") as decoder_output: - onnx_inputs, onnx_outputs = export( + _, onnx_outputs = export( preprocessor, decoder_model, decoder_onnx_config, From 502d3b6ab0f947faf6cac9a0824994a162e96d9c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 1 Nov 2022 07:52:45 -0400 Subject: [PATCH 05/29] Remove pin temporarily to get tests --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3a9bce4ff8..7747598a43 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ "timeout-decorator", "timm", "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch>=1.7,!=1.12.0", + "torch>=1.7,!=1.12.0,<1.13", "torchaudio", "pyctcdecode>=0.4.0", "tqdm>=4.27", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 1d6223f2a7..796a2687d5 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -69,7 +69,7 @@ "timeout-decorator": "timeout-decorator", "timm": "timm", "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch": "torch>=1.7,!=1.12.0", + "torch": "torch>=1.7,!=1.12.0,<1.13", "torchaudio": "torchaudio", "pyctcdecode": "pyctcdecode>=0.4.0", "tqdm": "tqdm>=4.27", From 8f95346c97e4abbf9e2ce44b5dc33c6ab829f4fc Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 1 Nov 2022 13:21:12 +0000 Subject: [PATCH 06/29] Add ESMFold code sample (#20000) * Add ESMFold code sample * sorry sylvain * make fixup * sorry sylvain again --- src/transformers/models/esm/modeling_esmfold.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/esm/modeling_esmfold.py b/src/transformers/models/esm/modeling_esmfold.py index b32769ee22..6b439c1685 100644 --- a/src/transformers/models/esm/modeling_esmfold.py +++ b/src/transformers/models/esm/modeling_esmfold.py @@ -52,6 +52,9 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "Rocketknight1/esmfold_v1" +_CONFIG_FOR_DOC = "EsmConfig" +_TOKENIZER_FOR_DOC = "EsmTokenizer" @dataclass @@ -2092,7 +2095,16 @@ def forward( Example: - TODO Matt + ```python + >>> from transformers import AutoTokenizer, EsmForProteinFolding + + >>> model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1") + >>> inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt") # A tiny random peptide + >>> outputs = model(**inputs) + >>> folded_positions = outputs.positions + ``` + """ cfg = self.config.esmfold_config From 94b3f544a1f5e04b78d87a2ae32a7ac252e22e31 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger Date: Tue, 1 Nov 2022 09:54:19 -0400 Subject: [PATCH 07/29] Unpin PyTorch for the release --- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7747598a43..3a9bce4ff8 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ "timeout-decorator", "timm", "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch>=1.7,!=1.12.0,<1.13", + "torch>=1.7,!=1.12.0", "torchaudio", "pyctcdecode>=0.4.0", "tqdm>=4.27", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 796a2687d5..1d6223f2a7 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -69,7 +69,7 @@ "timeout-decorator": 
"timeout-decorator", "timm": "timm", "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14", - "torch": "torch>=1.7,!=1.12.0,<1.13", + "torch": "torch>=1.7,!=1.12.0", "torchaudio": "torchaudio", "pyctcdecode": "pyctcdecode>=0.4.0", "tqdm": "tqdm>=4.27", From b7e2124cf726235ccaefe17ff960e9117f86949c Mon Sep 17 00:00:00 2001 From: mayank31398 Date: Mon, 5 Dec 2022 19:19:46 +0530 Subject: [PATCH 08/29] fix saving --- src/transformers/models/gpt2/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 0c3386b5a6..6d1bcf7ca6 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -140,7 +140,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.register_buffer("masked_bias", torch.tensor(-1e4)) if hasattr(config, 'attention_type'): - self.attention_type = config.attention_type + self.attention_type = AttentionType(config.attention_type) else: self.attention_type = AttentionType.MULTI_HEAD From 357ba81e703464dca4a955ebfb61ae81a6947927 Mon Sep 17 00:00:00 2001 From: denisko Date: Fri, 20 Jan 2023 12:17:01 +0000 Subject: [PATCH 09/29] added Raymond MQA variant --- .../models/gpt2/configuration_gpt2.py | 16 ++++ src/transformers/models/gpt2/modeling_gpt2.py | 90 +++++++++++++------ 2 files changed, 81 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index fe9c711d73..8f6749487a 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -34,6 +34,13 @@ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", } +from enum import Enum +class AttentionType(Enum): + MULTI_HEAD = 1 + MULTI_QUERY = 2 + MULTI_QUERY_1 = 3 + MULTI_QUERY_2 = 4 + class GPT2Config(PretrainedConfig): """ @@ -163,6 +170,7 @@ def __init__( eos_token_id=50256, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False, + attention_type=AttentionType.MULTI_HEAD, **kwargs, ): self.vocab_size = vocab_size @@ -190,6 +198,14 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + self.attention_type = attention_type + assert ( + self.attention_type == AttentionType.MULTI_HEAD or + self.attention_type == AttentionType.MULTI_QUERY or + self.attention_type == AttentionType.MULTI_QUERY_1 or + self.attention_type == AttentionType.MULTI_QUERY_2 + ) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 09407c7ac2..9bcb3b2d3f 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -44,14 +44,7 @@ replace_return_docstrings, ) from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_gpt2 import GPT2Config - -from enum import Enum -class AttentionType(Enum): - MULTI_HEAD = 1 - MULTI_QUERY = 2 - MULTI_QUERY_1 = 3 - +from .configuration_gpt2 import GPT2Config, AttentionType logger = logging.get_logger(__name__) @@ -139,16 +132,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): ) self.register_buffer("masked_bias", torch.tensor(-1e4)) - if hasattr(config, 'attention_type'): - self.attention_type = AttentionType(config.attention_type) - else: - self.attention_type = 
AttentionType.MULTI_HEAD - - assert ( - self.attention_type == AttentionType.MULTI_HEAD or - self.attention_type == AttentionType.MULTI_QUERY or - self.attention_type == AttentionType.MULTI_QUERY_1 - ) + self.attention_type = config.attention_type if hasattr(config, 'print_details') and config.print_details is True: self.print_details = layer_idx == 0 @@ -185,6 +169,10 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): else: if self.attention_type != AttentionType.MULTI_HEAD: self.c_attn = Conv1D((self.num_heads + 2) * self.head_dim, self.embed_dim) + elif self.attention_type != AttentionType.MULTI_QUERY_2: + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + # Keys and values are shared across heads + self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) else: self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) @@ -221,11 +209,22 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): - if self.attention_type == AttentionType.MULTI_QUERY_1: + if (self.attention_type == AttentionType.MULTI_QUERY_1 or + self.attention_type == AttentionType.MULTI_QUERY_2): + # query: (b, num_heads * sq, head_dim) + # key: (b, head_dim, sk) + # value: (b, sk, head_dim) + # NOTE: which one is more canonical? batch_size = query.shape[0] query_length = query.shape[1] // self.num_heads key_length = key.shape[-1] + #batch_size = query.size(0) + #query_length = query.size(1) // self.num_heads + #key_length = key.size(2) + + # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) attn_weights = torch.bmm(query, key) + # -> (b, num_heads, sq, sk) if self.print_details: print('query: ', query.shape) print('key: ', key.shape) @@ -239,9 +238,13 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): print('attn_weights: ', attn_weights.shape) if self.scale_attn_weights: + # NOTE: which is faster? attn_weights = attn_weights / torch.full( [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device ) + # attn_weights = attn_weights / torch.tensor( + # value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + #) # Layer-wise attention scaling if self.scale_attn_by_inverse_layer_idx: @@ -260,7 +263,9 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + # NOTE: which is faster? 
mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + #mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: @@ -278,9 +283,16 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): attn_weights = attn_weights * head_mask if self.attention_type == AttentionType.MULTI_QUERY_1: + # NOTE: is here ouptud dim order of attn_weights is messed up attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) attn_output = torch.bmm(attn_weights, value) attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + elif self.attention_type == AttentionType.MULTI_QUERY_2: + # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk) + _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) + # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim) + attn_output = torch.bmm(_attn_weights, value) + attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) else: attn_output = torch.matmul(attn_weights, value) @@ -382,15 +394,37 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - if self.attention_type != AttentionType.MULTI_HEAD: + if self.attention_type == AttentionType.MULTI_HEAD: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + elif self.attention_type == AttentionType.MULTI_QUERY_2: + query = self.q_attn(hidden_states) + key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) + else: query, key, value = self.c_attn(hidden_states).split( (self.num_heads*self.head_dim, self.head_dim, self.head_dim), dim=2 ) - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - if self.attention_type == AttentionType.MULTI_QUERY_1: + + + if self.attention_type == AttentionType.MULTI_QUERY_2: + batch_size, seq_length = query.shape[:2] + # (query_length, batch, num_heads, head_dim) + # (batch, num_heads * query_length, head_dim)\ + + # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim) + query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3]) + # -> (batch, num_heads * query_length, head_dim) + query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim) + + # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim) + # query = query.view( + # batch_size, seq_length, self.num_heads, self.head_dim, + # ).reshape( + # batch_size, seq_length * self.num_heads, self.head_dim + # ) + key = key.permute(0, 2, 1) # (batch_size, head_dim, seq_length) + # value (batch_size, seq_length, head_dim + elif self.attention_type == AttentionType.MULTI_QUERY_1: batch_size, seq_length = hidden_states.shape[:2] query = query.view( batch_size, seq_length, self.num_heads, self.head_dim, @@ -410,7 +444,9 @@ def forward( if layer_past is not None: past_key, past_value = layer_past - if self.attention_type == AttentionType.MULTI_QUERY_1: + # Concatenate on sequence dimension + if (self.attention_type == AttentionType.MULTI_QUERY_1 or + self.attention_type == AttentionType.MULTI_QUERY_2): key = torch.cat((past_key, key), dim=-1) else: key = torch.cat((past_key, key), dim=-2) @@ -423,6 +459,8 @@ def forward( present = None if self.reorder_and_upcast_attn: + 
# NOTE: exception is raised in __init__ if not multi head attention + #raise NotImplementedError("Reorder and upcast attention not implemented for MQA") attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) else: attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) @@ -466,6 +504,8 @@ def __init__(self, config, layer_idx=None): self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) if config.add_cross_attention: + if config.attention_type != AttentionType.MULTI_HEAD: + raise NotImplementedError("Cross-attention not implemented for MQA") self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx) self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) From a96771f0c88f50a60dc27e5a0a08633935c75877 Mon Sep 17 00:00:00 2001 From: denisko Date: Mon, 23 Jan 2023 08:49:12 +0000 Subject: [PATCH 10/29] chg: tensor vs fill acc to comments by Joel --- src/transformers/models/gpt2/modeling_gpt2.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 9bcb3b2d3f..992f853373 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -238,13 +238,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): print('attn_weights: ', attn_weights.shape) if self.scale_attn_weights: - # NOTE: which is faster? - attn_weights = attn_weights / torch.full( - [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - ) - # attn_weights = attn_weights / torch.tensor( - # value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - #) + attn_weights = attn_weights / value.size(-1) ** 0.5 # Layer-wise attention scaling if self.scale_attn_by_inverse_layer_idx: @@ -263,9 +257,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - # NOTE: which is faster? mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - #mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: From 14f2249445e81207b6aaa2d7cbbda985fcf77d11 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 13:30:19 -0500 Subject: [PATCH 11/29] Style and fix --- .../models/gpt2/configuration_gpt2.py | 12 +- src/transformers/models/gpt2/modeling_gpt2.py | 171 +++++++++--------- 2 files changed, 93 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 8f6749487a..41780463c2 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -15,6 +15,7 @@ # limitations under the License. 
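The multi-query patches above keep one query projection per head but a single shared key/value head, and fold the per-head matmuls into one `torch.bmm` over `(batch, num_heads * seq, head_dim)` tensors. A self-contained sketch of that formulation with toy sizes (the weight layout and sizes are illustrative assumptions; causal masking, dropout, and the KV cache are omitted):

```python
import torch


def multi_query_attention(hidden_states, q_weight, kv_weight, num_heads):
    """Every head gets its own query; all heads share one key head and one value head."""
    batch_size, seq_len, embed_dim = hidden_states.shape
    head_dim = embed_dim // num_heads

    # Per-head queries, shared key/value (mirrors the q_attn / kv_attn split above).
    query = hidden_states @ q_weight                                 # (b, sq, embed_dim)
    key, value = (hidden_states @ kv_weight).split(head_dim, dim=2)  # (b, sk, head_dim) each

    # (b, sq, embed_dim) -> (b, num_heads * sq, head_dim)
    query = query.view(batch_size, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
    query = query.reshape(batch_size, num_heads * seq_len, head_dim)
    key = key.permute(0, 2, 1)                                       # (b, head_dim, sk)

    # One batched matmul replaces the per-head matmul of multi-head attention.
    attn_weights = torch.bmm(query, key) / head_dim**0.5             # (b, nh * sq, sk)
    attn_weights = torch.softmax(
        attn_weights.view(batch_size, num_heads, seq_len, seq_len), dim=-1
    )

    attn_weights = attn_weights.view(batch_size, num_heads * seq_len, seq_len)
    attn_output = torch.bmm(attn_weights, value)                     # (b, nh * sq, head_dim)
    return attn_output.view(batch_size, num_heads, seq_len, head_dim)


# Toy shape check (sizes are arbitrary):
b, sq, nh, hd = 2, 5, 4, 8
x = torch.randn(b, sq, nh * hd)
out = multi_query_attention(x, torch.randn(nh * hd, nh * hd), torch.randn(nh * hd, 2 * hd), nh)
print(out.shape)  # torch.Size([2, 4, 5, 8])
```

Because keys and values are shared across heads, the cache per layer shrinks from `2 * num_heads * head_dim` to `2 * head_dim` values per token, which is the main motivation for the variants added in these commits.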
""" OpenAI GPT-2 configuration""" from collections import OrderedDict +from enum import Enum from typing import Any, List, Mapping, Optional from transformers import PreTrainedTokenizer, TensorType, is_torch_available @@ -34,7 +35,7 @@ "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", } -from enum import Enum + class AttentionType(Enum): MULTI_HEAD = 1 MULTI_QUERY = 2 @@ -198,13 +199,8 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - self.attention_type = attention_type - assert ( - self.attention_type == AttentionType.MULTI_HEAD or - self.attention_type == AttentionType.MULTI_QUERY or - self.attention_type == AttentionType.MULTI_QUERY_1 or - self.attention_type == AttentionType.MULTI_QUERY_2 - ) + # Convert to an int so it's JSON-serializable. + self.attention_type = int(AttentionType(attention_type)) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 992f853373..dd1a6de53a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -44,7 +44,8 @@ replace_return_docstrings, ) from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_gpt2 import GPT2Config, AttentionType +from .configuration_gpt2 import AttentionType, GPT2Config + logger = logging.get_logger(__name__) @@ -131,19 +132,19 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): ), ) self.register_buffer("masked_bias", torch.tensor(-1e4)) - - self.attention_type = config.attention_type - if hasattr(config, 'print_details') and config.print_details is True: + self.attention_type = AttentionType(config.attention_type) + + if hasattr(config, "print_details") and config.print_details is True: self.print_details = layer_idx == 0 else: self.print_details = False - + self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim - + if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" @@ -158,11 +159,11 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.layer_idx = layer_idx self.reorder_and_upcast_attn = config.reorder_and_upcast_attn if self.reorder_and_upcast_attn and self.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError(f'attention_type {self.attention_type} for reorder_and_upcast_attn') + raise NotImplementedError(f"attention_type {self.attention_type} for reorder_and_upcast_attn") if self.is_cross_attention: if self.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError(f'attention_type {self.attention_type} for cross_attention') + raise NotImplementedError(f"attention_type {self.attention_type} for cross_attention") self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) self.q_attn = Conv1D(self.embed_dim, self.embed_dim) @@ -182,16 +183,16 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.resid_dropout = nn.Dropout(config.resid_pdrop) self.pruned_heads = set() - + if self.print_details: - print('Attention info________________________________________________') - print('max_positions ', max_positions) - print('self.embed_dim ', self.embed_dim) - print('self.num_heads ', self.num_heads) - print('self.head_dim ', 
self.head_dim) - print('self.split_size ', self.split_size) - print('self.c_attn', self.c_attn) - print('______________________________________________________________') + print("Attention info________________________________________________") + print("max_positions ", max_positions) + print("self.embed_dim ", self.embed_dim) + print("self.num_heads ", self.num_heads) + print("self.head_dim ", self.head_dim) + print("self.split_size ", self.split_size) + print("self.c_attn", self.c_attn) + print("______________________________________________________________") def prune_heads(self, heads): if len(heads) == 0: @@ -209,8 +210,7 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): - if (self.attention_type == AttentionType.MULTI_QUERY_1 or - self.attention_type == AttentionType.MULTI_QUERY_2): + if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: # query: (b, num_heads * sq, head_dim) # key: (b, head_dim, sk) # value: (b, sk, head_dim) @@ -218,24 +218,24 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): batch_size = query.shape[0] query_length = query.shape[1] // self.num_heads key_length = key.shape[-1] - #batch_size = query.size(0) - #query_length = query.size(1) // self.num_heads - #key_length = key.size(2) - + # batch_size = query.size(0) + # query_length = query.size(1) // self.num_heads + # key_length = key.size(2) + # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) attn_weights = torch.bmm(query, key) # -> (b, num_heads, sq, sk) if self.print_details: - print('query: ', query.shape) - print('key: ', key.shape) - print('attn_weights: ', attn_weights.shape) + print("query: ", query.shape) + print("key: ", key.shape) + print("attn_weights: ", attn_weights.shape) attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) else: attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.print_details: - print('query: ', query.shape) - print('key.transpose(-1, -2): ', key.transpose(-1, -2).shape) - print('attn_weights: ', attn_weights.shape) + print("query: ", query.shape) + print("key.transpose(-1, -2): ", key.transpose(-1, -2).shape) + print("attn_weights: ", attn_weights.shape) if self.scale_attn_weights: attn_weights = attn_weights / value.size(-1) ** 0.5 @@ -249,10 +249,10 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if self.attention_type != AttentionType.MULTI_QUERY_1: query_length, key_length = query.size(-2), key.size(-2) if self.print_details: - print('query', query.shape) - print('key', key.shape) - print('query_length', query_length) - print('key_length', key_length) + print("query", query.shape) + print("key", key.shape) + print("query_length", query_length) + print("key_length", key_length) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
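For readers tracing the MULTI_QUERY_1 / MULTI_QUERY_2 branch of `_attn` above, a small standalone sketch of the same shape bookkeeping may help; it is illustrative only (the sizes and the `mqa_scores` helper are made up for this example and are not part of the patch):

import torch

def mqa_scores(query, key, num_heads):
    # query: (batch, num_heads * q_len, head_dim) -- heads folded into the sequence axis
    # key:   (batch, head_dim, k_len)             -- one key head shared by every query head
    batch_size = query.size(0)
    q_len = query.size(1) // num_heads
    k_len = key.size(-1)
    scores = torch.bmm(query, key)  # (batch, num_heads * q_len, k_len)
    return scores.view(batch_size, num_heads, q_len, k_len)

scores = mqa_scores(torch.randn(2, 4 * 3, 8), torch.randn(2, 8, 5), num_heads=4)  # -> (2, 4, 3, 5)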
@@ -369,12 +369,12 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - + if self.print_details: - print('Attention_______________________________________________') + print("Attention_______________________________________________") if encoder_hidden_states is not None: if self.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError(f'attention_type {self.attention_type} for encoder_hidden_states') + raise NotImplementedError(f"attention_type {self.attention_type} for encoder_hidden_states") if not hasattr(self, "q_attn"): raise ValueError( @@ -393,10 +393,8 @@ def forward( key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) else: query, key, value = self.c_attn(hidden_states).split( - (self.num_heads*self.head_dim, self.head_dim, self.head_dim), - dim=2 + (self.num_heads * self.head_dim, self.head_dim, self.head_dim), dim=2 ) - if self.attention_type == AttentionType.MULTI_QUERY_2: batch_size, seq_length = query.shape[:2] @@ -419,11 +417,12 @@ def forward( elif self.attention_type == AttentionType.MULTI_QUERY_1: batch_size, seq_length = hidden_states.shape[:2] query = query.view( - batch_size, seq_length, self.num_heads, self.head_dim, - ).reshape( - batch_size, seq_length * self.num_heads, self.head_dim - ) - key = key.permute(0, 2, 1) # [batch_size, head_dim, seq_length] + batch_size, + seq_length, + self.num_heads, + self.head_dim, + ).reshape(batch_size, seq_length * self.num_heads, self.head_dim) + key = key.permute(0, 2, 1) # [batch_size, head_dim, seq_length] # value [batch_size, seq_length, head_dim] elif self.attention_type == AttentionType.MULTI_QUERY: query = self._split_heads(query, self.num_heads, self.head_dim) @@ -437,8 +436,10 @@ def forward( if layer_past is not None: past_key, past_value = layer_past # Concatenate on sequence dimension - if (self.attention_type == AttentionType.MULTI_QUERY_1 or - self.attention_type == AttentionType.MULTI_QUERY_2): + if ( + self.attention_type == AttentionType.MULTI_QUERY_1 + or self.attention_type == AttentionType.MULTI_QUERY_2 + ): key = torch.cat((past_key, key), dim=-1) else: key = torch.cat((past_key, key), dim=-2) @@ -452,7 +453,7 @@ def forward( if self.reorder_and_upcast_attn: # NOTE: exception is raised in __init__ if not multi head attention - #raise NotImplementedError("Reorder and upcast attention not implemented for MQA") + # raise NotImplementedError("Reorder and upcast attention not implemented for MQA") attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) else: attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) @@ -503,7 +504,7 @@ def __init__(self, config, layer_idx=None): self.mlp = GPT2MLP(inner_dim, config) - if hasattr(config, 'print_details') and config.print_details is True: + if hasattr(config, "print_details") and config.print_details is True: self.print_details = layer_idx == 0 else: self.print_details = False @@ -522,8 +523,8 @@ def forward( residual = hidden_states hidden_states = self.ln_1(hidden_states) if self.print_details: - print('hidden_states, ba', hidden_states.size()) - print('attention_mask, ba', attention_mask.size()) + print("hidden_states, ba", hidden_states.size()) + print("attention_mask, ba", attention_mask.size()) attn_outputs = self.attn( hidden_states, layer_past=layer_past, @@ -822,7 +823,7 @@ def __init__(self, config): self.device_map = None self.gradient_checkpointing 
= False - if hasattr(config, 'print_details') and config.print_details is True: + if hasattr(config, "print_details") and config.print_details is True: self.print_details = True else: self.print_details = False @@ -907,7 +908,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.print_details: - print('-------------startd forward-----------------------------------------') + print("-------------startd forward-----------------------------------------") if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -927,8 +928,8 @@ def forward( if position_ids is not None: position_ids = position_ids.view(-1, input_shape[-1]) if self.print_details: - print('token_type_ids ', token_type_ids is not None) - print('position_ids ', position_ids is not None) + print("token_type_ids ", token_type_ids is not None) + print("position_ids ", position_ids is not None) if past_key_values is None: past_length = 0 @@ -939,21 +940,21 @@ def forward( position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) if self.print_details: - print('past_length ', past_length) - print('position_ids', position_ids.size(), position_ids) - print('past_key_values', len(past_key_values)) + print("past_length ", past_length) + print("position_ids", position_ids.size(), position_ids) + print("past_key_values", len(past_key_values)) # GPT2Attention mask. if self.print_details: - print('attention_mask ', attention_mask is not None) + print("attention_mask ", attention_mask is not None) if attention_mask is not None: if batch_size <= 0: raise ValueError("batch_size has to be defined and > 0") if self.print_details: - print('attention_mask ', attention_mask.size()) + print("attention_mask ", attention_mask.size()) attention_mask = attention_mask.view(batch_size, -1) if self.print_details: - print('attention_mask ', attention_mask.size()) + print("attention_mask ", attention_mask.size()) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] @@ -961,7 +962,7 @@ def forward( # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
attention_mask = attention_mask[:, None, None, :] if self.print_details: - print('attention_mask ', attention_mask.size()) + print("attention_mask ", attention_mask.size()) print(attention_mask) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for @@ -976,7 +977,7 @@ def forward( print(torch.finfo(self.dtype).min) if self.print_details: - print('encoder_hidden_states ', encoder_hidden_states is not None) + print("encoder_hidden_states ", encoder_hidden_states is not None) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.add_cross_attention and encoder_hidden_states is not None: @@ -994,16 +995,16 @@ def forward( # head_mask has shape n_layer x batch x n_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) if self.print_details: - print('head_mask', len(head_mask)) + print("head_mask", len(head_mask)) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds if self.print_details: - print('inputs_embeds', inputs_embeds.size()) - print('position_embeds', position_embeds.size()) - print('hidden_states', hidden_states.size()) + print("inputs_embeds", inputs_embeds.size()) + print("position_embeds", position_embeds.size()) + print("hidden_states", hidden_states.size()) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) @@ -1013,13 +1014,13 @@ def forward( output_shape = input_shape + (hidden_states.size(-1),) if self.print_details: - print('output_shape ', output_shape) - print('input_shape ', input_shape) - print('hidden_states.size ', hidden_states.size()) + print("output_shape ", output_shape) + print("input_shape ", input_shape) + print("hidden_states.size ", hidden_states.size()) - print('use_cache', use_cache) - print('output_attentions', output_attentions) - print('output_hidden_states', output_attentions) + print("use_cache", use_cache) + print("output_attentions", output_attentions) + print("output_hidden_states", output_attentions) presents = () if use_cache else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1066,15 +1067,21 @@ def custom_forward(*inputs): ) else: if self.print_details and i == 0: - print('Block .......................................................................') - print('hidden_states', hidden_states.size()) - print('attention_mask', attention_mask.size()) - print('layer_past', False if layer_past is None else [st.shape for st in layer_past]) - print('head_mask[i]', False if head_mask[i] is None else head_mask[i].size()) - print('encoder_hidden_states', False if encoder_hidden_states is None else encoder_hidden_states.size()) - print('encoder_attention_mask', False if encoder_attention_mask is None else encoder_attention_mask.size()) - print('use_cache', use_cache) - print('output_attentions', output_attentions) + print("Block .......................................................................") + print("hidden_states", hidden_states.size()) + print("attention_mask", attention_mask.size()) + print("layer_past", False if layer_past is None else [st.shape for st in layer_past]) + print("head_mask[i]", False if head_mask[i] is None else head_mask[i].size()) + print( + "encoder_hidden_states", + False if encoder_hidden_states is None else encoder_hidden_states.size(), + ) + 
print( + "encoder_attention_mask", + False if encoder_attention_mask is None else encoder_attention_mask.size(), + ) + print("use_cache", use_cache) + print("output_attentions", output_attentions) outputs = block( hidden_states, layer_past=layer_past, @@ -1117,7 +1124,7 @@ def custom_forward(*inputs): if v is not None ) if self.print_details: - print('-------------finish forward-----------------------------------------') + print("-------------finish forward-----------------------------------------") return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, From 4dc821cb81e1207ea6447c2003ecf4c63fc3c9e4 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 13:35:02 -0500 Subject: [PATCH 12/29] cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 101 +----------------- 1 file changed, 1 insertion(+), 100 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index dd1a6de53a..4fc73cde47 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -135,11 +135,6 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.attention_type = AttentionType(config.attention_type) - if hasattr(config, "print_details") and config.print_details is True: - self.print_details = layer_idx == 0 - else: - self.print_details = False - self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads @@ -184,16 +179,6 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.pruned_heads = set() - if self.print_details: - print("Attention info________________________________________________") - print("max_positions ", max_positions) - print("self.embed_dim ", self.embed_dim) - print("self.num_heads ", self.num_heads) - print("self.head_dim ", self.head_dim) - print("self.split_size ", self.split_size) - print("self.c_attn", self.c_attn) - print("______________________________________________________________") - def prune_heads(self, heads): if len(heads) == 0: return @@ -225,17 +210,9 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) attn_weights = torch.bmm(query, key) # -> (b, num_heads, sq, sk) - if self.print_details: - print("query: ", query.shape) - print("key: ", key.shape) - print("attn_weights: ", attn_weights.shape) attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) else: attn_weights = torch.matmul(query, key.transpose(-1, -2)) - if self.print_details: - print("query: ", query.shape) - print("key.transpose(-1, -2): ", key.transpose(-1, -2).shape) - print("attn_weights: ", attn_weights.shape) if self.scale_attn_weights: attn_weights = attn_weights / value.size(-1) ** 0.5 @@ -248,11 +225,6 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): # if only "normal" attention layer implements causal mask if self.attention_type != AttentionType.MULTI_QUERY_1: query_length, key_length = query.size(-2), key.size(-2) - if self.print_details: - print("query", query.shape) - print("key", key.shape) - print("query_length", query_length) - print("key_length", key_length) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected 
scalar type float but found double`. @@ -370,8 +342,6 @@ def forward( output_attentions: Optional[bool] = False, ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if self.print_details: - print("Attention_______________________________________________") if encoder_hidden_states is not None: if self.attention_type != AttentionType.MULTI_HEAD: raise NotImplementedError(f"attention_type {self.attention_type} for encoder_hidden_states") @@ -504,11 +474,6 @@ def __init__(self, config, layer_idx=None): self.mlp = GPT2MLP(inner_dim, config) - if hasattr(config, "print_details") and config.print_details is True: - self.print_details = layer_idx == 0 - else: - self.print_details = False - def forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], @@ -522,9 +487,6 @@ def forward( ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: residual = hidden_states hidden_states = self.ln_1(hidden_states) - if self.print_details: - print("hidden_states, ba", hidden_states.size()) - print("attention_mask, ba", attention_mask.size()) attn_outputs = self.attn( hidden_states, layer_past=layer_past, @@ -823,11 +785,6 @@ def __init__(self, config): self.device_map = None self.gradient_checkpointing = False - if hasattr(config, "print_details") and config.print_details is True: - self.print_details = True - else: - self.print_details = False - # Initialize weights and apply final processing self.post_init() @@ -907,8 +864,6 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.print_details: - print("-------------startd forward-----------------------------------------") if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -927,9 +882,6 @@ def forward( token_type_ids = token_type_ids.view(-1, input_shape[-1]) if position_ids is not None: position_ids = position_ids.view(-1, input_shape[-1]) - if self.print_details: - print("token_type_ids ", token_type_ids is not None) - print("position_ids ", position_ids is not None) if past_key_values is None: past_length = 0 @@ -939,31 +891,18 @@ def forward( if position_ids is None: position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - if self.print_details: - print("past_length ", past_length) - print("position_ids", position_ids.size(), position_ids) - print("past_key_values", len(past_key_values)) # GPT2Attention mask. - if self.print_details: - print("attention_mask ", attention_mask is not None) if attention_mask is not None: if batch_size <= 0: raise ValueError("batch_size has to be defined and > 0") - if self.print_details: - print("attention_mask ", attention_mask.size()) attention_mask = attention_mask.view(batch_size, -1) - if self.print_details: - print("attention_mask ", attention_mask.size()) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] # this attention mask is more simple than the triangular masking of causal attention # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
attention_mask = attention_mask[:, None, None, :] - if self.print_details: - print("attention_mask ", attention_mask.size()) - print(attention_mask) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for @@ -972,12 +911,7 @@ def forward( # effectively the same as removing these entirely. attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - if self.print_details: - print(attention_mask) - print(torch.finfo(self.dtype).min) - if self.print_details: - print("encoder_hidden_states ", encoder_hidden_states is not None) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.add_cross_attention and encoder_hidden_states is not None: @@ -994,17 +928,11 @@ def forward( # attention_probs has shape bsz x n_heads x N x N # head_mask has shape n_layer x batch x n_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) - if self.print_details: - print("head_mask", len(head_mask)) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds - if self.print_details: - print("inputs_embeds", inputs_embeds.size()) - print("position_embeds", position_embeds.size()) - print("hidden_states", hidden_states.size()) if token_type_ids is not None: token_type_embeds = self.wte(token_type_ids) @@ -1013,14 +941,7 @@ def forward( hidden_states = self.drop(hidden_states) output_shape = input_shape + (hidden_states.size(-1),) - if self.print_details: - print("output_shape ", output_shape) - print("input_shape ", input_shape) - print("hidden_states.size ", hidden_states.size()) - - print("use_cache", use_cache) - print("output_attentions", output_attentions) - print("output_hidden_states", output_attentions) + presents = () if use_cache else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1066,22 +987,6 @@ def custom_forward(*inputs): encoder_attention_mask, ) else: - if self.print_details and i == 0: - print("Block .......................................................................") - print("hidden_states", hidden_states.size()) - print("attention_mask", attention_mask.size()) - print("layer_past", False if layer_past is None else [st.shape for st in layer_past]) - print("head_mask[i]", False if head_mask[i] is None else head_mask[i].size()) - print( - "encoder_hidden_states", - False if encoder_hidden_states is None else encoder_hidden_states.size(), - ) - print( - "encoder_attention_mask", - False if encoder_attention_mask is None else encoder_attention_mask.size(), - ) - print("use_cache", use_cache) - print("output_attentions", output_attentions) outputs = block( hidden_states, layer_past=layer_past, @@ -1092,8 +997,6 @@ def custom_forward(*inputs): use_cache=use_cache, output_attentions=output_attentions, ) - if self.print_details and i == 0: - print(len(outputs)) hidden_states = outputs[0] if use_cache is True: @@ -1123,8 +1026,6 @@ def custom_forward(*inputs): for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] if v is not None ) - if self.print_details: - print("-------------finish forward-----------------------------------------") return 
BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, From 129e8c9a7592cfb40f5c7d6e5902f96275574830 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 13:42:22 -0500 Subject: [PATCH 13/29] cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 4fc73cde47..dd67aa38fb 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -163,14 +163,13 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) self.q_attn = Conv1D(self.embed_dim, self.embed_dim) else: - if self.attention_type != AttentionType.MULTI_HEAD: - self.c_attn = Conv1D((self.num_heads + 2) * self.head_dim, self.embed_dim) - elif self.attention_type != AttentionType.MULTI_QUERY_2: + if self.attention_type == AttentionType.MULTI_QUERY_2: self.q_attn = Conv1D(self.embed_dim, self.embed_dim) # Keys and values are shared across heads self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) else: - self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + k_dim=self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim + self.c_attn = Conv1D(self.embed_dim+2*k_dim, self.embed_dim) self.c_proj = Conv1D(self.embed_dim, self.embed_dim) From 303e1b8f33d793e85ccf172fece7a54584c4c528 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 14:21:51 -0500 Subject: [PATCH 14/29] cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index dd67aa38fb..02e74b0cca 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -136,10 +136,10 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.attention_type = AttentionType(config.attention_type) self.embed_dim = config.hidden_size + self.kv_dim=self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim - if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" @@ -166,10 +166,9 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): if self.attention_type == AttentionType.MULTI_QUERY_2: self.q_attn = Conv1D(self.embed_dim, self.embed_dim) # Keys and values are shared across heads - self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) + self.kv_attn = Conv1D(2 * self.kv_dim, self.embed_dim) else: - k_dim=self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim - self.c_attn = Conv1D(self.embed_dim+2*k_dim, self.embed_dim) + self.c_attn = Conv1D(self.embed_dim+2*self.kv_dim, self.embed_dim) self.c_proj = Conv1D(self.embed_dim, self.embed_dim) @@ -340,12 +339,8 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if encoder_hidden_states is not None: - if self.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError(f"attention_type 
{self.attention_type} for encoder_hidden_states") - - if not hasattr(self, "q_attn"): + if not hasattr(self, "q_attn") or not self.is_cross_attention: raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. " "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." @@ -355,15 +350,11 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - if self.attention_type == AttentionType.MULTI_HEAD: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - elif self.attention_type == AttentionType.MULTI_QUERY_2: + if self.attention_type == AttentionType.MULTI_QUERY_2: query = self.q_attn(hidden_states) key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) else: - query, key, value = self.c_attn(hidden_states).split( - (self.num_heads * self.head_dim, self.head_dim, self.head_dim), dim=2 - ) + query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) if self.attention_type == AttentionType.MULTI_QUERY_2: batch_size, seq_length = query.shape[:2] From d0b58e9d8473e8bf7201f7da8c97cf2046d76170 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 14:45:49 -0500 Subject: [PATCH 15/29] cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 63 ++++++------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 02e74b0cca..8db2607bf5 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -138,6 +138,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.embed_dim = config.hidden_size self.kv_dim=self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.num_heads = config.num_attention_heads + self.num_kv_heads=self.num_heads if self.attention_type == AttentionType.MULTI_HEAD else 1 self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: @@ -193,17 +194,13 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): + batch_size = query.size(0) if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: # query: (b, num_heads * sq, head_dim) # key: (b, head_dim, sk) # value: (b, sk, head_dim) - # NOTE: which one is more canonical? 
- batch_size = query.shape[0] - query_length = query.shape[1] // self.num_heads - key_length = key.shape[-1] - # batch_size = query.size(0) - # query_length = query.size(1) // self.num_heads - # key_length = key.size(2) + query_length = query.size(1) // self.num_heads + key_length = key.size(2) # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) attn_weights = torch.bmm(query, key) @@ -244,17 +241,15 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if head_mask is not None: attn_weights = attn_weights * head_mask - if self.attention_type == AttentionType.MULTI_QUERY_1: - # NOTE: is here ouptud dim order of attn_weights is messed up - attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) - attn_output = torch.bmm(attn_weights, value) - attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) - elif self.attention_type == AttentionType.MULTI_QUERY_2: + if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk) _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim) attn_output = torch.bmm(_attn_weights, value) attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + if self.attention_type == AttentionType.MULTI_QUERY_1: + # TODO: Why? + attn_weights=_attn_weights else: attn_output = torch.matmul(attn_weights, value) @@ -352,46 +347,24 @@ def forward( else: if self.attention_type == AttentionType.MULTI_QUERY_2: query = self.q_attn(hidden_states) - key, value = self.kv_attn(hidden_states).split(self.head_dim, dim=2) + key, value = self.kv_attn(hidden_states).split((self.kv_dim, self.kv_dim), dim=2) else: query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) - if self.attention_type == AttentionType.MULTI_QUERY_2: + if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: batch_size, seq_length = query.shape[:2] - # (query_length, batch, num_heads, head_dim) - # (batch, num_heads * query_length, head_dim)\ - - # (batch, query_length, hidden_size) -> (batch, num_heads, query_length, head_dim) - query = query.view(batch_size, seq_length, self.num_heads, self.head_dim).permute([0, 2, 1, 3]) - # -> (batch, num_heads * query_length, head_dim) - query = query.reshape(batch_size, self.num_heads * seq_length, self.head_dim) - - # (batch, query_length, hidden_size) -> (batch, query_length * num_heads, head_dim) - # query = query.view( - # batch_size, seq_length, self.num_heads, self.head_dim, - # ).reshape( - # batch_size, seq_length * self.num_heads, self.head_dim - # ) - key = key.permute(0, 2, 1) # (batch_size, head_dim, seq_length) - # value (batch_size, seq_length, head_dim - elif self.attention_type == AttentionType.MULTI_QUERY_1: - batch_size, seq_length = hidden_states.shape[:2] - query = query.view( - batch_size, - seq_length, - self.num_heads, - self.head_dim, - ).reshape(batch_size, seq_length * self.num_heads, self.head_dim) + + query = query.view(batch_size, seq_length, self.num_heads, self.head_dim) + if self.attention_type == AttentionType.MULTI_QUERY_2: + query = query.permute([0, 2, 1, 3]) + query = query.reshape(batch_size, seq_length * self.num_heads, self.head_dim) + key = key.permute(0, 2, 1) # [batch_size, head_dim, 
seq_length] # value [batch_size, seq_length, head_dim] - elif self.attention_type == AttentionType.MULTI_QUERY: - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, 1, self.head_dim) - value = self._split_heads(value, 1, self.head_dim) else: query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_kv_heads, self.head_dim) + value = self._split_heads(value, self.num_kv_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past From a1e91828d7d716a15596dca338d84d60e955fad4 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 16:04:06 -0500 Subject: [PATCH 16/29] Fixes and cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 66 +++++++------------ 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 8db2607bf5..b1ad4345a7 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -194,20 +194,15 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): - batch_size = query.size(0) - if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: - # query: (b, num_heads * sq, head_dim) - # key: (b, head_dim, sk) - # value: (b, sk, head_dim) - query_length = query.size(1) // self.num_heads - key_length = key.size(2) - - # (b, num_heads * sq, head_dim) x (b, head_dim, sk) -> (b, num_heads * sq, sk) - attn_weights = torch.bmm(query, key) - # -> (b, num_heads, sq, sk) - attn_weights = attn_weights.view(batch_size, self.num_heads, query_length, key_length) - else: - attn_weights = torch.matmul(query, key.transpose(-1, -2)) + # query: (b, nh, sq, hs) + # key: (b, kv_nh, sk, hs) + # value: (b, kv_nh, sk, hs) + + query_length = query.size(2) + key_length = key.size(2) + + # (b, nh, sq, hs) x (b, kv_nh, hs, sk) -> (b, nh, sq, sk) + attn_weights = torch.matmul(query, key.transpose(-1, -2)) if self.scale_attn_weights: attn_weights = attn_weights / value.size(-1) ** 0.5 @@ -218,8 +213,6 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if not self.is_cross_attention: # if only "normal" attention layer implements causal mask - if self.attention_type != AttentionType.MULTI_QUERY_1: - query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
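As a side note on the causal masking above: the slice `key_length - query_length : key_length` also covers incremental decoding with a KV cache, where only the new tokens carry query rows. A minimal sketch, assuming `max_positions = 8` and dropping the two broadcast dimensions of `self.bias` for clarity (illustrative only, not part of the patch):

import torch

max_positions = 8
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))

# Prompt processing: 5 queries over 5 keys -> the usual lower-triangular 5x5 mask.
print(bias[0:5, :5])

# Incremental decoding: 1 new query over 4 cached keys plus itself -> one all-True row.
query_length, key_length = 1, 5
print(bias[key_length - query_length : key_length, :key_length])  # tensor([[True, True, True, True, True]])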
@@ -241,17 +234,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if head_mask is not None: attn_weights = attn_weights * head_mask - if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: - # (b, num_heads, sq, sk) -> (b, num_heads * sq, sk) - _attn_weights = attn_weights.view(batch_size, self.num_heads * query_length, key_length) - # (b, num_heads * sq, sk) x (b, sk, head_dim) -> (b, num_heads * sq, head_dim) - attn_output = torch.bmm(_attn_weights, value) - attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) - if self.attention_type == AttentionType.MULTI_QUERY_1: - # TODO: Why? - attn_weights=_attn_weights - else: - attn_output = torch.matmul(attn_weights, value) + attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights @@ -345,26 +328,25 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: + # hidden_states: (b, sq, embed) if self.attention_type == AttentionType.MULTI_QUERY_2: + # (b, sq, embed) x (embed, nh*hs) -> (b, sq, nh*hs) query = self.q_attn(hidden_states) + # (b, sq, embed) x (embed, 2*kv) -> (b, sq, 2*kv) -> (b, sq, kv) + (b, sq, kv) key, value = self.kv_attn(hidden_states).split((self.kv_dim, self.kv_dim), dim=2) else: + # (b, sq, embed) x (embed, nh*hs + 2*kv) -> (b, sq, nh*hs + 2*kv) -> (b, sq, nh*hs) + (b, sq, kv) + (b, sq, kv) query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) - - if self.attention_type == AttentionType.MULTI_QUERY_1 or self.attention_type == AttentionType.MULTI_QUERY_2: - batch_size, seq_length = query.shape[:2] - - query = query.view(batch_size, seq_length, self.num_heads, self.head_dim) - if self.attention_type == AttentionType.MULTI_QUERY_2: - query = query.permute([0, 2, 1, 3]) - query = query.reshape(batch_size, seq_length * self.num_heads, self.head_dim) - - key = key.permute(0, 2, 1) # [batch_size, head_dim, seq_length] - # value [batch_size, seq_length, head_dim] - else: - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_kv_heads, self.head_dim) - value = self._split_heads(value, self.num_kv_heads, self.head_dim) + # query: (b, sq, nh*hs) + # key: (b, sk, kv), sk == sq + # value: (b, sk, kv) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_kv_heads, self.head_dim) + value = self._split_heads(value, self.num_kv_heads, self.head_dim) + # query: (b, nh, sq, hs) + # key: (b, kv_nh, sk, hs) + # value: (b, kv_nh, sk, hs) if layer_past is not None: past_key, past_value = layer_past From a57ca7aa6c045794b723dc6d1637219aa8032271 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 16:07:17 -0500 Subject: [PATCH 17/29] Fixes and cleanup --- src/transformers/models/gpt2/modeling_gpt2.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b1ad4345a7..cc110db5b3 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -194,13 +194,6 @@ def prune_heads(self, heads): self.pruned_heads = self.pruned_heads.union(heads) def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # query: (b, nh, sq, hs) - # key: (b, kv_nh, sk, hs) - # value: (b, kv_nh, sk, hs) - - 
query_length = query.size(2) - key_length = key.size(2) - # (b, nh, sq, hs) x (b, kv_nh, hs, sk) -> (b, nh, sq, sk) attn_weights = torch.matmul(query, key.transpose(-1, -2)) @@ -213,6 +206,7 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): if not self.is_cross_attention: # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. From e152e942d6ba93bb0d875be9565c9dc46b0407a8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 23 Jan 2023 16:21:40 -0500 Subject: [PATCH 18/29] Fixes and merge implementations --- .../models/gpt2/configuration_gpt2.py | 5 ++--- src/transformers/models/gpt2/modeling_gpt2.py | 20 +------------------ 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 41780463c2..141e8d4629 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -38,9 +38,8 @@ class AttentionType(Enum): MULTI_HEAD = 1 - MULTI_QUERY = 2 - MULTI_QUERY_1 = 3 - MULTI_QUERY_2 = 4 + MULTI_QUERY_1 = 2 + MULTI_QUERY_2 = 3 class GPT2Config(PretrainedConfig): diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index cc110db5b3..8538450bef 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -322,37 +322,19 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - # hidden_states: (b, sq, embed) if self.attention_type == AttentionType.MULTI_QUERY_2: - # (b, sq, embed) x (embed, nh*hs) -> (b, sq, nh*hs) query = self.q_attn(hidden_states) - # (b, sq, embed) x (embed, 2*kv) -> (b, sq, 2*kv) -> (b, sq, kv) + (b, sq, kv) key, value = self.kv_attn(hidden_states).split((self.kv_dim, self.kv_dim), dim=2) else: - # (b, sq, embed) x (embed, nh*hs + 2*kv) -> (b, sq, nh*hs + 2*kv) -> (b, sq, nh*hs) + (b, sq, kv) + (b, sq, kv) query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) - # query: (b, sq, nh*hs) - # key: (b, sk, kv), sk == sq - # value: (b, sk, kv) query = self._split_heads(query, self.num_heads, self.head_dim) key = self._split_heads(key, self.num_kv_heads, self.head_dim) value = self._split_heads(value, self.num_kv_heads, self.head_dim) - # query: (b, nh, sq, hs) - # key: (b, kv_nh, sk, hs) - # value: (b, kv_nh, sk, hs) if layer_past is not None: past_key, past_value = layer_past - # Concatenate on sequence dimension - if ( - self.attention_type == AttentionType.MULTI_QUERY_1 - or self.attention_type == AttentionType.MULTI_QUERY_2 - ): - key = torch.cat((past_key, key), dim=-1) - else: - key = torch.cat((past_key, key), dim=-2) - + key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) if use_cache is True: From 2e32a95ee7c81198827444f578e104d45bdf186b Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 24 Jan 2023 16:05:29 -0500 Subject: [PATCH 19/29] Fixes and improvements --- .../models/gpt2/configuration_gpt2.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 154 +++++++++--------- 2 
files changed, 75 insertions(+), 81 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 141e8d4629..fb5a3422e6 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -199,7 +199,7 @@ def __init__( self.eos_token_id = eos_token_id # Convert to an int so it's JSON-serializable. - self.attention_type = int(AttentionType(attention_type)) + self.attention_type = AttentionType(attention_type).value super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 8538450bef..1d468ff2b4 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -134,11 +134,10 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.register_buffer("masked_bias", torch.tensor(-1e4)) self.attention_type = AttentionType(config.attention_type) + self.is_mqa = self.attention_type != AttentionType.MULTI_HEAD self.embed_dim = config.hidden_size - self.kv_dim=self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.num_heads = config.num_attention_heads - self.num_kv_heads=self.num_heads if self.attention_type == AttentionType.MULTI_HEAD else 1 self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: @@ -154,11 +153,9 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx self.layer_idx = layer_idx self.reorder_and_upcast_attn = config.reorder_and_upcast_attn - if self.reorder_and_upcast_attn and self.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError(f"attention_type {self.attention_type} for reorder_and_upcast_attn") if self.is_cross_attention: - if self.attention_type != AttentionType.MULTI_HEAD: + if self.is_mqa: raise NotImplementedError(f"attention_type {self.attention_type} for cross_attention") self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) @@ -167,9 +164,10 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): if self.attention_type == AttentionType.MULTI_QUERY_2: self.q_attn = Conv1D(self.embed_dim, self.embed_dim) # Keys and values are shared across heads - self.kv_attn = Conv1D(2 * self.kv_dim, self.embed_dim) + self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) else: - self.c_attn = Conv1D(self.embed_dim+2*self.kv_dim, self.embed_dim) + c_dim = self.embed_dim + 2 * (self.embed_dim if self.is_mqa else self.head_dim) + self.c_attn = Conv1D(c_dim, self.embed_dim) self.c_proj = Conv1D(self.embed_dim, self.embed_dim) @@ -193,75 +191,70 @@ def prune_heads(self, heads): self.num_heads = self.num_heads - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # (b, nh, sq, hs) x (b, kv_nh, hs, sk) -> (b, nh, sq, sk) - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / value.size(-1) ** 0.5 - - # Layer-wise attention scaling - if self.scale_attn_by_inverse_layer_idx: - attn_weights = attn_weights / float(self.layer_idx + 1) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - query_length, key_length = 
query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): - # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - # Preallocate attn_weights for `baddbmm` - attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + def _matmul(self, x, y): + if self.is_mqa: + # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) + # A X V: (b, sq, nh, sk) x (b, sk, hs) -> (b, sq, nh, hs) + return torch.matmul(x.view(x.size(0), -1, x.size(-1)), y).view(*x.shape[:-1], y.shape[-1]) + else: + # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) + # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) + return torch.matmul(x, y) + + def _matmul_scaled(self, x, y, dtype, scale_factor=1.0): + if scale_factor == 1.0: + return self._matmul(x, y) + output_shape = (*x.shape[:-1], y.size(-1)) + if self.is_mqa: + # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) + output_view = (x.size(0), x.size(1) * x.size(2), y.size(-1)) + z = torch.empty(output_view, dtype=dtype, device=x.device) + z = torch.baddbmm( + z, + x.view(*output_view[:-1], x.size(-1)), + y, + beta=0, + alpha=scale_factor, + ) + else: + # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) + output_view = (x.size(0) * x.size(1), x.size(2), y.size(-1)) + z = torch.empty(output_view, dtype=dtype, device=x.device) + z = torch.baddbmm( + z, + x.view(output_view[0], *x.shape[2:]), + y.view(output_view[0], *y.shape[2:]), + beta=0, + alpha=scale_factor, + ) + return z.view(output_shape) - # Compute Scale Factor + def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): scale_factor = 1.0 if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 + scale_factor /= value.size(-1) ** 0.5 if self.scale_attn_by_inverse_layer_idx: - scale_factor /= float(self.layer_idx + 1) + scale_factor /= self.layer_idx + 1 - # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) - attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + attn_weights = 
self._matmul_scaled( + query, key.transpose(-1, -2), dtype=torch.float32 if upcast else query.dtype, scale_factor=scale_factor + ) if not self.is_cross_attention: # if only "normal" attention layer implements causal mask query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + if self.is_mqa: + # (b, sq, nh, sk) + causal_mask = self.bias[:, key_length - query_length : key_length, :, :key_length].to(torch.bool) + else: + # (b, nh, sq, sk) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: @@ -270,8 +263,8 @@ def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, hea attn_weights = nn.functional.softmax(attn_weights, dim=-1) - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise - if attn_weights.dtype != torch.float32: + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + if upcast and attn_weights.dtype != torch.float32: raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) @@ -280,23 +273,26 @@ def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, hea if head_mask is not None: attn_weights = attn_weights * head_mask - attn_output = torch.matmul(attn_weights, value) + attn_output = self._matmul(attn_weights, value) return attn_output, attn_weights - def _split_heads(self, tensor, num_heads, attn_head_size): + def _split_heads(self, tensor, num_heads, attn_head_size, permute=True): """ Splits hidden_size dim into attn_head_size and num_heads """ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) tensor = tensor.view(new_shape) - return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + if permute: + tensor = tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + return tensor - def _merge_heads(self, tensor, num_heads, attn_head_size): + def _merge_heads(self, tensor, num_heads, attn_head_size, permute=True): """ Merges attn_head_size dim and num_attn_heads dim into hidden_size """ - tensor = tensor.permute(0, 2, 1, 3).contiguous() + if permute: + tensor = tensor.permute(0, 2, 1, 3).contiguous() new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) return tensor.view(new_shape) @@ -310,7 +306,7 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], ...]: if encoder_hidden_states is not None: if not hasattr(self, "q_attn") or not self.is_cross_attention: raise ValueError( @@ -328,9 +324,10 @@ def forward( else: query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, 
self.kv_dim), dim=2) - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_kv_heads, self.head_dim) - value = self._split_heads(value, self.num_kv_heads, self.head_dim) + query = self._split_heads(query, self.num_heads, self.head_dim, permute=not self.is_mqa) + if not self.is_mqa: + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past @@ -342,14 +339,11 @@ def forward( else: present = None - if self.reorder_and_upcast_attn: - # NOTE: exception is raised in __init__ if not multi head attention - # raise NotImplementedError("Reorder and upcast attention not implemented for MQA") - attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - else: - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + attn_output, attn_weights = self._attn( + query, key, value, attention_mask, head_mask, upcast=self.reorder_and_upcast_attn + ) - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim, permute=not self.is_mqa) attn_output = self.c_proj(attn_output) attn_output = self.resid_dropout(attn_output) From 93b42d25bcd395caf339f880ae758664e00b5be6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 24 Jan 2023 16:27:27 -0500 Subject: [PATCH 20/29] simplify and fix --- src/transformers/models/gpt2/modeling_gpt2.py | 49 +++++++------------ 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 1d468ff2b4..d3affe9bf9 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -191,42 +191,25 @@ def prune_heads(self, heads): self.num_heads = self.num_heads - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _matmul(self, x, y): + def _matmul(self, x, y, dtype=None, scale_factor=1.0): + output_shape = (*x.size()[:-1], y.size(-1)) if self.is_mqa: # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) # A X V: (b, sq, nh, sk) x (b, sk, hs) -> (b, sq, nh, hs) - return torch.matmul(x.view(x.size(0), -1, x.size(-1)), y).view(*x.shape[:-1], y.shape[-1]) - else: - # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) - # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) - return torch.matmul(x, y) - - def _matmul_scaled(self, x, y, dtype, scale_factor=1.0): - if scale_factor == 1.0: - return self._matmul(x, y) - output_shape = (*x.shape[:-1], y.size(-1)) - if self.is_mqa: - # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) output_view = (x.size(0), x.size(1) * x.size(2), y.size(-1)) - z = torch.empty(output_view, dtype=dtype, device=x.device) - z = torch.baddbmm( - z, - x.view(*output_view[:-1], x.size(-1)), - y, - beta=0, - alpha=scale_factor, - ) + x=x.view(*output_view[:-1], x.size(-1)) else: # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) + # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) output_view = (x.size(0) * x.size(1), x.size(2), y.size(-1)) - z = torch.empty(output_view, dtype=dtype, device=x.device) - z = torch.baddbmm( - z, - x.view(output_view[0], *x.shape[2:]), - y.view(output_view[0], *y.shape[2:]), - beta=0, - alpha=scale_factor, - ) + x=x.view(output_view[0], *x.size()[2:]) + 
y=y.view(output_view[0], *y.size()[2:]) + if scale_factor == 1.0 and dtype is None: + # TODO: Is baddbmm identical? + z=torch.matmul(x, y) + else: + z = torch.empty(output_view, dtype=x.dtype if dtype is None else dtype, device=x.device) + z = torch.baddbmm(z,x,y,beta=0,alpha=scale_factor) return z.view(output_shape) def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): @@ -238,18 +221,20 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=F scale_factor /= self.layer_idx + 1 with autocast(enabled=False): - attn_weights = self._matmul_scaled( - query, key.transpose(-1, -2), dtype=torch.float32 if upcast else query.dtype, scale_factor=scale_factor + attn_weights = self._matmul( + query, key.transpose(-1, -2), dtype=torch.float32 if upcast else None, scale_factor=scale_factor ) if not self.is_cross_attention: # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) + key_length = key.size(-2) if self.is_mqa: # (b, sq, nh, sk) + query_length = query.size(1) causal_mask = self.bias[:, key_length - query_length : key_length, :, :key_length].to(torch.bool) else: # (b, nh, sq, sk) + query_length = query.size(-2) causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) mask_value = torch.finfo(attn_weights.dtype).min # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. From 82b11dff0dfd1ac9f579d72121a16d7b4f1a5503 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 25 Jan 2023 17:08:19 -0500 Subject: [PATCH 21/29] Fixes, optimization and comments --- src/transformers/models/gpt2/modeling_gpt2.py | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index d3affe9bf9..0d6850da9a 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -123,15 +123,14 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): class GPT2Attention(nn.Module): def __init__(self, config, is_cross_attention=False, layer_idx=None): super().__init__() - max_positions = config.max_position_embeddings self.register_buffer( - "bias", - torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( - 1, 1, max_positions, max_positions - ), + "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + # We don't use a buffer because the mask value depends on the dtype, + # And the dtype will be different if upcasting. 
+ self.mask_value = None self.attention_type = AttentionType(config.attention_type) self.is_mqa = self.attention_type != AttentionType.MULTI_HEAD @@ -139,6 +138,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads + self.kv_dim = self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -166,8 +166,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): # Keys and values are shared across heads self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) else: - c_dim = self.embed_dim + 2 * (self.embed_dim if self.is_mqa else self.head_dim) - self.c_attn = Conv1D(c_dim, self.embed_dim) + self.c_attn = Conv1D(self.embed_dim + 2 * self.kv_dim, self.embed_dim) self.c_proj = Conv1D(self.embed_dim, self.embed_dim) @@ -197,19 +196,19 @@ def _matmul(self, x, y, dtype=None, scale_factor=1.0): # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) # A X V: (b, sq, nh, sk) x (b, sk, hs) -> (b, sq, nh, hs) output_view = (x.size(0), x.size(1) * x.size(2), y.size(-1)) - x=x.view(*output_view[:-1], x.size(-1)) + # No copy needed for MQA 2, or when layer_past is provided. + x = x.reshape(*output_view[:-1], x.size(-1)) else: # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) output_view = (x.size(0) * x.size(1), x.size(2), y.size(-1)) - x=x.view(output_view[0], *x.size()[2:]) - y=y.view(output_view[0], *y.size()[2:]) - if scale_factor == 1.0 and dtype is None: - # TODO: Is baddbmm identical? - z=torch.matmul(x, y) - else: - z = torch.empty(output_view, dtype=x.dtype if dtype is None else dtype, device=x.device) - z = torch.baddbmm(z,x,y,beta=0,alpha=scale_factor) + # Always copies + x = x.reshape(output_view[0], *x.size()[2:]) + # No copy when layer_past is provided. + y = y.reshape(output_view[0], *y.size()[2:]) + # This is identical to matmul when scale_factor==1 + z = torch.empty(output_view, dtype=x.dtype if dtype is None else dtype, device=x.device) + z = torch.baddbmm(z, x, y, beta=0, alpha=scale_factor) return z.view(output_shape) def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): @@ -230,17 +229,20 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=F key_length = key.size(-2) if self.is_mqa: # (b, sq, nh, sk) - query_length = query.size(1) - causal_mask = self.bias[:, key_length - query_length : key_length, :, :key_length].to(torch.bool) + causal_mask = self.bias[None, key_length - query.size(1) : key_length, None, :key_length] else: # (b, nh, sq, sk) - query_length = query.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) + causal_mask = self.bias[None, None, key_length - query.size(-2) : key_length, :key_length] + # torch.where expects a tensor. 
We use a cache to avoid recreating it every time. + if ( + self.mask_value is None + or self.mask_value.dtype != attn_weights.dtype + or self.mask_value.device != attn_weights.device + ): + self.mask_value = torch.full( + [], torch.finfo(attn_weights.dtype).min, dtype=attn_weights.dtype, device=attn_weights.device + ) + attn_weights = torch.where(causal_mask, attn_weights, self.mask_value) if attention_mask is not None: # Apply the attention mask From 98319da5defd1462c05f7eba1b9e215c704b274e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 26 Jan 2023 18:53:44 -0500 Subject: [PATCH 22/29] Best GeLU] --- src/transformers/activations.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/activations.py b/src/transformers/activations.py index d9caf8763e..0c383a37de 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -35,6 +35,15 @@ def forward(self, input: Tensor) -> Tensor: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) +class NewGELUActivationPython(nn.Module): + """ + Same as NewGELUActivation (up to rounding errors), with a fast C/cuda implementation. + """ + + def forward(self, input: Tensor) -> Tensor: + return nn.functional.gelu(input, approximate="tanh") + + class GELUActivation(nn.Module): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. For @@ -154,6 +163,7 @@ def __getitem__(self, key): "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, + "gelu_new_python": NewGELUActivationPython, "gelu_python": (GELUActivation, {"use_gelu_python": True}), "linear": LinearActivation, "mish": MishActivation, From d81e46fac8ce5472caa425b5049432b73cf96944 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 27 Jan 2023 18:49:37 -0500 Subject: [PATCH 23/29] simpler gelu --- src/transformers/activations.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 0c383a37de..5f2cc49432 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -35,15 +35,6 @@ def forward(self, input: Tensor) -> Tensor: return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) -class NewGELUActivationPython(nn.Module): - """ - Same as NewGELUActivation (up to rounding errors), with a fast C/cuda implementation. - """ - - def forward(self, input: Tensor) -> Tensor: - return nn.functional.gelu(input, approximate="tanh") - - class GELUActivation(nn.Module): """ Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For @@ -163,7 +154,7 @@ def __getitem__(self, key): "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, - "gelu_new_python": NewGELUActivationPython, + "gelu_new_python": (nn.GELU, {"approximate": "tanh"}), "gelu_python": (GELUActivation, {"use_gelu_python": True}), "linear": LinearActivation, "mish": MishActivation, From a1d7a9590b880e9981bece0a0eaca23a3c4f3e8d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:10:49 -0500 Subject: [PATCH 24/29] Move code --- .../models/gpt2/configuration_gpt2.py | 11 -- src/transformers/models/gpt2/modeling_gpt2.py | 179 +++++++++--------- .../gpt_bigcode/configuration_gpt_bigcode.py | 14 +- .../gpt_bigcode/modeling_gpt_bigcode.py | 179 +++++++++--------- 4 files changed, 191 insertions(+), 192 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index fb5a3422e6..fe9c711d73 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -15,7 +15,6 @@ # limitations under the License. """ OpenAI GPT-2 configuration""" from collections import OrderedDict -from enum import Enum from typing import Any, List, Mapping, Optional from transformers import PreTrainedTokenizer, TensorType, is_torch_available @@ -36,12 +35,6 @@ } -class AttentionType(Enum): - MULTI_HEAD = 1 - MULTI_QUERY_1 = 2 - MULTI_QUERY_2 = 3 - - class GPT2Config(PretrainedConfig): """ This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to @@ -170,7 +163,6 @@ def __init__( eos_token_id=50256, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False, - attention_type=AttentionType.MULTI_HEAD, **kwargs, ): self.vocab_size = vocab_size @@ -198,9 +190,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - # Convert to an int so it's JSON-serializable. - self.attention_type = AttentionType(attention_type).value - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 72f1428b4d..5fe33bbca5 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -44,7 +44,7 @@ replace_return_docstrings, ) from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_gpt2 import AttentionType, GPT2Config +from .configuration_gpt2 import GPT2Config logger = logging.get_logger(__name__) @@ -122,22 +122,19 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): class GPT2Attention(nn.Module): def __init__(self, config, is_cross_attention=False, layer_idx=None): super().__init__() + max_positions = config.max_position_embeddings self.register_buffer( - "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( + 1, 1, max_positions, max_positions + ), ) - self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) - # We don't use a buffer because the mask value depends on the dtype, - # And the dtype will be different if upcasting. 
- self.mask_value = None - - self.attention_type = AttentionType(config.attention_type) - self.is_mqa = self.attention_type != AttentionType.MULTI_HEAD + self.register_buffer("masked_bias", torch.tensor(-1e4)) self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - self.kv_dim = self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -154,19 +151,10 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.reorder_and_upcast_attn = config.reorder_and_upcast_attn if self.is_cross_attention: - if self.is_mqa: - raise NotImplementedError(f"attention_type {self.attention_type} for cross_attention") - self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) self.q_attn = Conv1D(self.embed_dim, self.embed_dim) else: - if self.attention_type == AttentionType.MULTI_QUERY_2: - self.q_attn = Conv1D(self.embed_dim, self.embed_dim) - # Keys and values are shared across heads - self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) - else: - self.c_attn = Conv1D(self.embed_dim + 2 * self.kv_dim, self.embed_dim) - + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) self.c_proj = Conv1D(self.embed_dim, self.embed_dim) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -189,59 +177,77 @@ def prune_heads(self, heads): self.num_heads = self.num_heads - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _matmul(self, x, y, dtype=None, scale_factor=1.0): - output_shape = (*x.size()[:-1], y.size(-1)) - if self.is_mqa: - # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) - # A X V: (b, sq, nh, sk) x (b, sk, hs) -> (b, sq, nh, hs) - output_view = (x.size(0), x.size(1) * x.size(2), y.size(-1)) - # No copy needed for MQA 2, or when layer_past is provided. - x = x.reshape(*output_view[:-1], x.size(-1)) - else: - # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) - # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) - output_view = (x.size(0) * x.size(1), x.size(2), y.size(-1)) - # Always copies - x = x.reshape(output_view[0], *x.size()[2:]) - # No copy when layer_past is provided. - y = y.reshape(output_view[0], *y.size()[2:]) - # This is identical to matmul when scale_factor==1 - z = torch.empty(output_view, dtype=x.dtype if dtype is None else dtype, device=x.device) - z = torch.baddbmm(z, x, y, beta=0, alpha=scale_factor) - return z.view(output_shape) - - def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / torch.full( + [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device + ) + + # Layer-wise attention scaling + if self.scale_attn_by_inverse_layer_idx: + attn_weights = attn_weights / float(self.layer_idx + 1) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + attn_weights = attn_weights.type(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): + # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) + bsz, num_heads, q_seq_len, dk = query.size() + _, _, k_seq_len, _ = key.size() + + # Preallocate attn_weights for `baddbmm` + attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) + + # Compute Scale Factor scale_factor = 1.0 if self.scale_attn_weights: - scale_factor /= value.size(-1) ** 0.5 + scale_factor /= float(value.size(-1)) ** 0.5 if self.scale_attn_by_inverse_layer_idx: - scale_factor /= self.layer_idx + 1 + scale_factor /= float(self.layer_idx + 1) + # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) with autocast(enabled=False): - attn_weights = self._matmul( - query, key.transpose(-1, -2), dtype=torch.float32 if upcast else None, scale_factor=scale_factor - ) + q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) + attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) + attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) if not self.is_cross_attention: # if only "normal" attention layer implements causal mask - key_length = key.size(-2) - if self.is_mqa: - # (b, sq, nh, sk) - causal_mask = self.bias[None, key_length - query.size(1) : key_length, None, :key_length] - else: - # (b, nh, sq, sk) - causal_mask = self.bias[None, None, key_length - query.size(-2) : key_length, :key_length] - # torch.where expects a tensor. We use a cache to avoid recreating it every time. - if ( - self.mask_value is None - or self.mask_value.dtype != attn_weights.dtype - or self.mask_value.device != attn_weights.device - ): - self.mask_value = torch.full( - [], torch.finfo(attn_weights.dtype).min, dtype=attn_weights.dtype, device=attn_weights.device - ) - attn_weights = torch.where(causal_mask, attn_weights, self.mask_value) + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + mask_value = torch.finfo(attn_weights.dtype).min + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) + attn_weights = torch.where(causal_mask, attn_weights, mask_value) if attention_mask is not None: # Apply the attention mask @@ -249,8 +255,8 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=F attn_weights = nn.functional.softmax(attn_weights, dim=-1) - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - if upcast and attn_weights.dtype != torch.float32: + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise + if attn_weights.dtype != torch.float32: raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) @@ -259,26 +265,23 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=F if head_mask is not None: attn_weights = attn_weights * head_mask - attn_output = self._matmul(attn_weights, value) + attn_output = torch.matmul(attn_weights, value) return attn_output, attn_weights - def _split_heads(self, tensor, num_heads, attn_head_size, permute=True): + def _split_heads(self, tensor, num_heads, attn_head_size): """ Splits hidden_size dim into attn_head_size and num_heads """ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) tensor = tensor.view(new_shape) - if permute: - tensor = tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - return tensor + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - def _merge_heads(self, tensor, num_heads, attn_head_size, permute=True): + def _merge_heads(self, tensor, num_heads, attn_head_size): """ Merges attn_head_size dim and num_attn_heads dim into hidden_size """ - if permute: - tensor = tensor.permute(0, 2, 1, 3).contiguous() + tensor = tensor.permute(0, 2, 1, 3).contiguous() new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) return tensor.view(new_shape) @@ -292,9 +295,9 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], ...]: + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: if encoder_hidden_states is not None: - if not hasattr(self, "q_attn") or not self.is_cross_attention: + if not hasattr(self, "q_attn"): raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. " "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
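
For reference, the torch.baddbmm(..., beta=0, alpha=scale_factor) pattern restored in _upcast_and_reordered_attn above is the same trick the gpt_bigcode _matmul relies on: with beta=0 the preallocated output tensor is ignored and the 1/sqrt(d_k) scaling is fused into the Q x K^T product. A minimal sketch with toy shapes (the sizes below are illustrative, not taken from the patch) shows that it matches a plain scaled matmul:

import torch

bsz, num_heads, q_len, k_len, head_dim = 2, 4, 5, 5, 8
query = torch.randn(bsz, num_heads, q_len, head_dim)
key = torch.randn(bsz, num_heads, k_len, head_dim)
scale_factor = 1.0 / head_dim**0.5

# Fold the batch and head dimensions together and let baddbmm apply the scale.
q = query.reshape(-1, q_len, head_dim)
k = key.transpose(-1, -2).reshape(-1, head_dim, k_len)
out = torch.empty(bsz * num_heads, q_len, k_len, dtype=torch.float32)
out = torch.baddbmm(out, q.float(), k.float(), beta=0, alpha=scale_factor)

# Same result as scaling an ordinary 4-D matmul.
reference = scale_factor * torch.matmul(query, key.transpose(-1, -2))
assert torch.allclose(out.view(bsz, num_heads, q_len, k_len), reference, atol=1e-6)
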
@@ -304,16 +307,11 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - if self.attention_type == AttentionType.MULTI_QUERY_2: - query = self.q_attn(hidden_states) - key, value = self.kv_attn(hidden_states).split((self.kv_dim, self.kv_dim), dim=2) - else: - query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - query = self._split_heads(query, self.num_heads, self.head_dim, permute=not self.is_mqa) - if not self.is_mqa: - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past @@ -325,11 +323,12 @@ def forward( else: present = None - attn_output, attn_weights = self._attn( - query, key, value, attention_mask, head_mask, upcast=self.reorder_and_upcast_attn - ) + if self.reorder_and_upcast_attn: + attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) + else: + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim, permute=not self.is_mqa) + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) attn_output = self.c_proj(attn_output) attn_output = self.resid_dropout(attn_output) @@ -368,8 +367,6 @@ def __init__(self, config, layer_idx=None): self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) if config.add_cross_attention: - if config.attention_type != AttentionType.MULTI_HEAD: - raise NotImplementedError("Cross-attention not implemented for MQA") self.crossattention = GPT2Attention(config, is_cross_attention=True, layer_idx=layer_idx) self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 8fcf554ded..9fcfdd7cea 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -15,6 +15,7 @@ # limitations under the License. """ OpenAI GPT-2 configuration""" from collections import OrderedDict +from enum import Enum from typing import Any, List, Mapping, Optional from transformers import PreTrainedTokenizer, TensorType, is_torch_available @@ -31,6 +32,12 @@ } +class AttentionType(Enum): + MULTI_HEAD = 1 + MULTI_QUERY_1 = 2 + MULTI_QUERY_2 = 3 + + class GPTBigCodeConfig(PretrainedConfig): """ # TODO: Update doc @@ -160,6 +167,7 @@ def __init__( eos_token_id=50256, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False, + attention_type=AttentionType.MULTI_HEAD, **kwargs, ): self.vocab_size = vocab_size @@ -187,11 +195,13 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id + # Convert to an int so it's JSON-serializable. + self.attention_type = AttentionType(attention_type).value + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) -class GPTBigCodeOnnxConfig(OnnxConfigWithPast): - # TODO: Onnx support? 
+class GPT2OnnxConfig(OnnxConfigWithPast): def __init__( self, config: PretrainedConfig, diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 8625e77759..e0bade8eb5 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -44,7 +44,7 @@ replace_return_docstrings, ) from ...utils.model_parallel_utils import assert_device_map, get_device_map -from .configuration_gpt_bigcode import GPTBigCodeConfig +from .configuration_gpt_bigcode import AttentionType, GPTBigCodeConfig logger = logging.get_logger(__name__) @@ -59,7 +59,6 @@ def load_tf_weights_in_gpt_bigcode(model, config, gpt_bigcode_checkpoint_path): """Load tf checkpoints in a pytorch model""" - # TODO: Update this. try: import re @@ -121,16 +120,20 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): max_positions = config.max_position_embeddings self.register_buffer( - "bias", - torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( - 1, 1, max_positions, max_positions - ), + "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) + # We don't use a buffer because the mask value depends on the dtype, + # And the dtype will be different if upcasting. + self.mask_value = None + + self.attention_type = AttentionType(config.attention_type) + self.is_mqa = self.attention_type != AttentionType.MULTI_HEAD self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads + self.kv_dim = self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -147,10 +150,19 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.reorder_and_upcast_attn = config.reorder_and_upcast_attn if self.is_cross_attention: + if self.is_mqa: + raise NotImplementedError(f"attention_type {self.attention_type} for cross_attention") + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) self.q_attn = Conv1D(self.embed_dim, self.embed_dim) else: - self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + if self.attention_type == AttentionType.MULTI_QUERY_2: + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + # Keys and values are shared across heads + self.kv_attn = Conv1D(2 * self.head_dim, self.embed_dim) + else: + self.c_attn = Conv1D(self.embed_dim + 2 * self.kv_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -173,77 +185,59 @@ def prune_heads(self, heads): self.num_heads = self.num_heads - len(heads) self.pruned_heads = self.pruned_heads.union(heads) - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / torch.full( - [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - ) - - # Layer-wise attention scaling - if self.scale_attn_by_inverse_layer_idx: - attn_weights = attn_weights / float(self.layer_idx + 1) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - 
query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): - # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - # Preallocate attn_weights for `baddbmm` - attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) - - # Compute Scale Factor + def _matmul(self, x, y, dtype=None, scale_factor=1.0): + output_shape = (*x.size()[:-1], y.size(-1)) + if self.is_mqa: + # Q x K: (b, sq, nh, hs) x (b, hs, sk) -> (b, sq, nh, sk) + # A X V: (b, sq, nh, sk) x (b, sk, hs) -> (b, sq, nh, hs) + output_view = (x.size(0), x.size(1) * x.size(2), y.size(-1)) + # No copy needed for MQA 2, or when layer_past is provided. + x = x.reshape(*output_view[:-1], x.size(-1)) + else: + # Q x K: (b, nh, sq, hs) x (b, nh, hs, sk) -> (b, nh, sq, sk) + # A X V: (b, nh, sq, sk) x (b, nh, sk, hs) -> (b, nh, sq, hs) + output_view = (x.size(0) * x.size(1), x.size(2), y.size(-1)) + # Always copies + x = x.reshape(output_view[0], *x.size()[2:]) + # No copy when layer_past is provided. 
+ y = y.reshape(output_view[0], *y.size()[2:]) + # This is identical to matmul when scale_factor==1 + z = torch.empty(output_view, dtype=x.dtype if dtype is None else dtype, device=x.device) + z = torch.baddbmm(z, x, y, beta=0, alpha=scale_factor) + return z.view(output_shape) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): scale_factor = 1.0 if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 + scale_factor /= value.size(-1) ** 0.5 if self.scale_attn_by_inverse_layer_idx: - scale_factor /= float(self.layer_idx + 1) + scale_factor /= self.layer_idx + 1 - # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) - attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) + attn_weights = self._matmul( + query, key.transpose(-1, -2), dtype=torch.float32 if upcast else None, scale_factor=scale_factor + ) if not self.is_cross_attention: # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) + key_length = key.size(-2) + if self.is_mqa: + # (b, sq, nh, sk) + causal_mask = self.bias[None, key_length - query.size(1) : key_length, None, :key_length] + else: + # (b, nh, sq, sk) + causal_mask = self.bias[None, None, key_length - query.size(-2) : key_length, :key_length] + # torch.where expects a tensor. We use a cache to avoid recreating it every time. 
+ if ( + self.mask_value is None + or self.mask_value.dtype != attn_weights.dtype + or self.mask_value.device != attn_weights.device + ): + self.mask_value = torch.full( + [], torch.finfo(attn_weights.dtype).min, dtype=attn_weights.dtype, device=attn_weights.device + ) + attn_weights = torch.where(causal_mask, attn_weights, self.mask_value) if attention_mask is not None: # Apply the attention mask @@ -251,8 +245,8 @@ def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, hea attn_weights = nn.functional.softmax(attn_weights, dim=-1) - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise - if attn_weights.dtype != torch.float32: + # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise + if upcast and attn_weights.dtype != torch.float32: raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") attn_weights = attn_weights.type(value.dtype) attn_weights = self.attn_dropout(attn_weights) @@ -261,23 +255,26 @@ def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, hea if head_mask is not None: attn_weights = attn_weights * head_mask - attn_output = torch.matmul(attn_weights, value) + attn_output = self._matmul(attn_weights, value) return attn_output, attn_weights - def _split_heads(self, tensor, num_heads, attn_head_size): + def _split_heads(self, tensor, num_heads, attn_head_size, permute=True): """ Splits hidden_size dim into attn_head_size and num_heads """ new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) tensor = tensor.view(new_shape) - return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + if permute: + tensor = tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + return tensor - def _merge_heads(self, tensor, num_heads, attn_head_size): + def _merge_heads(self, tensor, num_heads, attn_head_size, permute=True): """ Merges attn_head_size dim and num_attn_heads dim into hidden_size """ - tensor = tensor.permute(0, 2, 1, 3).contiguous() + if permute: + tensor = tensor.permute(0, 2, 1, 3).contiguous() new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) return tensor.view(new_shape) @@ -291,9 +288,9 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], ...]: if encoder_hidden_states is not None: - if not hasattr(self, "q_attn"): + if not hasattr(self, "q_attn") or not self.is_cross_attention: raise ValueError( "If class is used as cross attention, the weights `q_attn` have to be defined. " "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`." 
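
The hunk that follows changes how the fused QKV projection is split for multi-query attention: the query keeps one slice per head, a single key and a single value of size head_dim are shared by all heads, and the query is reshaped to (batch, seq, heads, head_dim) without the usual permute. A rough sketch of the resulting shapes, using nn.Linear as a stand-in for transformers' Conv1D and toy sizes that are not taken from the patch:

import torch

batch, seq, embed_dim, num_heads = 2, 6, 64, 8
head_dim = embed_dim // num_heads      # 8
kv_dim = head_dim                      # keys/values are shared across heads in MQA

c_attn = torch.nn.Linear(embed_dim, embed_dim + 2 * kv_dim)  # stand-in for Conv1D
hidden_states = torch.randn(batch, seq, embed_dim)

query, key, value = c_attn(hidden_states).split((embed_dim, kv_dim, kv_dim), dim=2)
query = query.view(batch, seq, num_heads, head_dim)  # (b, sq, nh, hs), no permute
# key and value stay 3-D: (b, sk, hs), one head shared by every query head.
attn_scores = torch.matmul(
    query.reshape(batch, seq * num_heads, head_dim), key.transpose(-1, -2)
).view(batch, seq, num_heads, key.size(1))           # (b, sq, nh, sk)
print(attn_scores.shape)  # torch.Size([2, 6, 8, 6])
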
@@ -303,11 +300,16 @@ def forward( key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) attention_mask = encoder_attention_mask else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + if self.attention_type == AttentionType.MULTI_QUERY_2: + query = self.q_attn(hidden_states) + key, value = self.kv_attn(hidden_states).split((self.kv_dim, self.kv_dim), dim=2) + else: + query, key, value = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2) - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) + query = self._split_heads(query, self.num_heads, self.head_dim, permute=not self.is_mqa) + if not self.is_mqa: + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) if layer_past is not None: past_key, past_value = layer_past @@ -319,12 +321,11 @@ def forward( else: present = None - if self.reorder_and_upcast_attn: - attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - else: - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + attn_output, attn_weights = self._attn( + query, key, value, attention_mask, head_mask, upcast=self.reorder_and_upcast_attn + ) - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim, permute=not self.is_mqa) attn_output = self.c_proj(attn_output) attn_output = self.resid_dropout(attn_output) @@ -363,6 +364,8 @@ def __init__(self, config, layer_idx=None): self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) if config.add_cross_attention: + if config.attention_type != AttentionType.MULTI_HEAD: + raise NotImplementedError("Cross-attention not implemented for MQA") self.crossattention = GPTBigCodeAttention(config, is_cross_attention=True, layer_idx=layer_idx) self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) From 138fefb832e51683b2489273ee808340466399c7 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:12:29 -0500 Subject: [PATCH 25/29] fix --- .../models/gpt_bigcode/configuration_gpt_bigcode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 9fcfdd7cea..8345c9743a 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -201,7 +201,7 @@ def __init__( super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) -class GPT2OnnxConfig(OnnxConfigWithPast): +class GPTBigCodeOnnxConfig(OnnxConfigWithPast): def __init__( self, config: PretrainedConfig, From 52a6d976b9ec8544c2b891d588ae42c234942842 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:17:41 -0500 Subject: [PATCH 26/29] fix --- src/transformers/models/gpt2/modeling_gpt2.py | 1 + src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 5fe33bbca5..a678669483 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ 
b/src/transformers/models/gpt2/modeling_gpt2.py @@ -64,6 +64,7 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """Load tf checkpoints in a pytorch model""" + # TODO: Update this. try: import re diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 8345c9743a..4442bd38da 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -202,6 +202,7 @@ def __init__( class GPTBigCodeOnnxConfig(OnnxConfigWithPast): + # TODO: Onnx support? def __init__( self, config: PretrainedConfig, From b4c9cf4be52c34f9e068be96479ebd3e6ffc4a81 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:18:58 -0500 Subject: [PATCH 27/29] fix --- src/transformers/models/gpt2/modeling_gpt2.py | 1 - src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index a678669483..5fe33bbca5 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -64,7 +64,6 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """Load tf checkpoints in a pytorch model""" - # TODO: Update this. try: import re diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 7c2deb3a92..4f188a25bf 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -59,6 +59,7 @@ def load_tf_weights_in_gpt_bigcode(model, config, gpt_bigcode_checkpoint_path): """Load tf checkpoints in a pytorch model""" + # TODO: Update this. try: import re From 3dd5a5bc7e3e37aa62604855da47dc1b22cecec8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:22:17 -0500 Subject: [PATCH 28/29] gelu --- src/transformers/activations.py | 24 ++++++++++++++++--- .../gpt_bigcode/configuration_gpt_bigcode.py | 2 +- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 5f2cc49432..1c59568835 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -25,6 +25,26 @@ logger = logging.get_logger(__name__) +class PytorchGELUTanh(nn.Module): + """ + A fast C implementation of the tanh approximation of the GeLU activation function. See + https://arxiv.org/abs/1606.08415. + This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical + match due to rounding errors. + """ + + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.12.0"): + raise ImportError( + f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use " + "PytorchGELUTanh. Please upgrade torch." + ) + + def forward(self, input: Tensor) -> Tensor: + return nn.functional.gelu(input, approximate="tanh") + + class NewGELUActivation(nn.Module): """ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see @@ -80,10 +100,8 @@ class ClippedGELUActivation(nn.Module): Clip the range of possible GeLU outputs between [min, max]. 
This is especially useful for quantization purpose, as it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to https://arxiv.org/abs/2004.09602. - Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when initially created. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 """ @@ -154,8 +172,8 @@ def __getitem__(self, key): "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, - "gelu_new_python": (nn.GELU, {"approximate": "tanh"}), "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "gelu_pytorch_tanh": PytorchGELUTanh, "linear": LinearActivation, "mish": MishActivation, "quick_gelu": QuickGELUActivation, diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 4442bd38da..6546b47253 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -150,7 +150,7 @@ def __init__( n_layer=12, n_head=12, n_inner=None, - activation_function="gelu_new", + activation_function="gelu_pytorch_tanh", resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, From 2c13d086c5f1a202d6fde69bc3ed148f60b21b67 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 7 Feb 2023 15:37:31 -0500 Subject: [PATCH 29/29] Add changes from fast inferences --- .../models/gpt_bigcode/__init__.py | 18 +++++++++------- .../gpt_bigcode/modeling_gpt_bigcode.py | 21 ++++++++++--------- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 14 ++++++++++--- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/gpt_bigcode/__init__.py b/src/transformers/models/gpt_bigcode/__init__.py index 5b585a8908..2af1863d70 100644 --- a/src/transformers/models/gpt_bigcode/__init__.py +++ b/src/transformers/models/gpt_bigcode/__init__.py @@ -18,15 +18,15 @@ from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { - "configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig", "GPTBigCodeOnnxConfig"], + "configuration_gpt_bigcode": [ + "GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPTBigCodeConfig", + "GPTBigCodeOnnxConfig", + ], } try: @@ -47,7 +47,11 @@ ] if TYPE_CHECKING: - from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig, GPTBigCodeOnnxConfig + from .configuration_gpt_bigcode import ( + GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPTBigCodeConfig, + GPTBigCodeOnnxConfig, + ) try: if not is_torch_available(): diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 4f188a25bf..82938e65ab 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -134,7 +134,8 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads - 
self.kv_dim = self.embed_dim if self.attention_type == AttentionType.MULTI_HEAD else self.head_dim + self.kv_heads = 1 if self.is_mqa else self.head_dim + self.kv_dim = self.kv_heads * self.head_dim self.split_size = self.embed_dim if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( @@ -150,6 +151,13 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None): self.layer_idx = layer_idx self.reorder_and_upcast_attn = config.reorder_and_upcast_attn + self.scale_factor = 1.0 + if self.scale_attn_weights: + self.scale_factor /= self.head_dim**0.5 + + if self.scale_attn_by_inverse_layer_idx: + self.scale_factor /= self.layer_idx + 1 + if self.is_cross_attention: if self.is_mqa: raise NotImplementedError(f"attention_type {self.attention_type} for cross_attention") @@ -208,16 +216,9 @@ def _matmul(self, x, y, dtype=None, scale_factor=1.0): return z.view(output_shape) def _attn(self, query, key, value, attention_mask=None, head_mask=None, upcast=False): - scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= value.size(-1) ** 0.5 - - if self.scale_attn_by_inverse_layer_idx: - scale_factor /= self.layer_idx + 1 - with autocast(enabled=False): attn_weights = self._matmul( - query, key.transpose(-1, -2), dtype=torch.float32 if upcast else None, scale_factor=scale_factor + query, key.transpose(-1, -2), dtype=torch.float32 if upcast else None, scale_factor=self.scale_factor ) if not self.is_cross_attention: @@ -282,7 +283,7 @@ def _merge_heads(self, tensor, num_heads, attn_head_size, permute=True): def forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 936d4c9f76..f5f31ce05f 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -31,12 +31,12 @@ from transformers import ( GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, + GPT2Tokenizer, GPTBigCodeDoubleHeadsModel, GPTBigCodeForSequenceClassification, GPTBigCodeForTokenClassification, GPTBigCodeLMHeadModel, GPTBigCodeModel, - GPT2Tokenizer, ) @@ -434,12 +434,20 @@ class GPTBigCodeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # TODO: Update the tests to use valid pretrained models. all_model_classes = ( - (GPTBigCodeModel, GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel, GPTBigCodeForSequenceClassification, GPTBigCodeForTokenClassification) + ( + GPTBigCodeModel, + GPTBigCodeLMHeadModel, + GPTBigCodeDoubleHeadsModel, + GPTBigCodeForSequenceClassification, + GPTBigCodeForTokenClassification, + ) if is_torch_available() else () ) all_generative_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () - all_parallelizable_model_classes = (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + all_parallelizable_model_classes = ( + (GPTBigCodeLMHeadModel, GPTBigCodeDoubleHeadsModel) if is_torch_available() else () + ) fx_compatible = True test_missing_keys = False test_model_parallel = True
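
By the end of the series the multi-query variants, the shared-KV projection, and the gelu_pytorch_tanh activation all live under models/gpt_bigcode. A small smoke test of the branch state is sketched below; it is not part of the patches. The class names are taken from the diffs above, the GPT-2-style configuration arguments (n_embd, n_positions, ...) are assumed to carry over unchanged, and the snippet only applies to a checkout with these patches applied (the released gpt_bigcode API later diverged). gelu_pytorch_tanh additionally requires torch >= 1.12.

import torch
from transformers.models.gpt_bigcode.configuration_gpt_bigcode import AttentionType, GPTBigCodeConfig
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeLMHeadModel

config = GPTBigCodeConfig(
    vocab_size=256,
    n_positions=128,          # assumed GPT-2-style argument name
    n_embd=64,
    n_layer=2,
    n_head=4,
    attention_type=AttentionType.MULTI_QUERY_1,  # stored as an int in the config
    activation_function="gelu_pytorch_tanh",
)
model = GPTBigCodeLMHeadModel(config).eval()
input_ids = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    logits = model(input_ids).logits
print(logits.shape)  # expected: torch.Size([1, 16, 256])
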