From 7042915f170acd2423f7954aa3b49ef19c107461 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Wed, 27 Mar 2024 21:48:31 +0000 Subject: [PATCH 001/131] wip --- docs/source/en/model_doc/dbrx.md | 44 + src/transformers/__init__.py | 17 + src/transformers/models/dbrx/__init__.py | 77 + .../models/dbrx/configuration_dbrx.py | 279 +++ src/transformers/models/dbrx/modeling_dbrx.py | 2132 +++++++++++++++++ tests/models/dbrx/__init__.py | 0 tests/models/dbrx/test_modeling_dbrx.py | 468 ++++ 7 files changed, 3017 insertions(+) create mode 100644 docs/source/en/model_doc/dbrx.md create mode 100644 src/transformers/models/dbrx/__init__.py create mode 100644 src/transformers/models/dbrx/configuration_dbrx.py create mode 100755 src/transformers/models/dbrx/modeling_dbrx.py create mode 100644 tests/models/dbrx/__init__.py create mode 100644 tests/models/dbrx/test_modeling_dbrx.py diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md new file mode 100644 index 000000000000..666a65bcf036 --- /dev/null +++ b/docs/source/en/model_doc/dbrx.md @@ -0,0 +1,44 @@ + + +# DBRX + +## Overview + +The DBRX model was proposed in []() by . + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](). The original code can be found [here](). + +## DbrxConfig + +[[autodoc]] DbrxConfig + + +## DbrxModel + +[[autodoc]] DbrxModel + - forward + + +## DbrxForCausalLM + +[[autodoc]] DbrxForCausalLM + - forward + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index da29d77972f4..4af0be04cb61 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -131,6 +131,7 @@ ], "models": [], # Models + "models.dbrx": ["DbrxConfig"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.align": [ "ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -1442,6 +1443,15 @@ # PyTorch models structure + _import_structure["models.dbrx"].extend( + [ + "DbrxForCausalLM", + "DbrxBlock", + "DbrxModel", + "DbrxPreTrainedModel", + ] + ) + _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -7786,6 +7796,13 @@ ) # PyTorch model imports + + from .models.dbrx import ( + DbrxForCausalLM, + DbrxBlock, + DbrxModel, + DbrxPreTrainedModel, + ) from .models.seamless_m4t import ( SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST, SeamlessM4TCodeHifiGan, diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py new file mode 100644 index 000000000000..1ee030d8af83 --- /dev/null +++ b/src/transformers/models/dbrx/__init__.py @@ -0,0 +1,77 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
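The `_import_structure` hunks in `src/transformers/__init__.py` above expose `DbrxConfig`, `DbrxForCausalLM`, `DbrxBlock`, `DbrxModel`, and `DbrxPreTrainedModel` at the top level of the library. Below is a minimal editorial sketch of exercising those exports once this patch is applied; the tiny sizes passed to `DbrxConfig` are illustrative overrides, not the documented DBRX defaults.

```python
# Minimal sketch, assuming this patch is applied. The small sizes are illustrative.
import torch
from transformers import DbrxConfig, DbrxForCausalLM

config = DbrxConfig(
    d_model=256,
    n_heads=8,
    n_layers=2,
    max_seq_len=128,
    vocab_size=1000,
    attn_config={"kv_n_heads": 2},   # nested dicts become DbrxAttentionConfig / DbrxFFNConfig
    ffn_config={"ffn_hidden_size": 512, "moe_num_experts": 4, "moe_top_k": 2},
)
model = DbrxForCausalLM(config)      # randomly initialized MoE decoder + LM head

input_ids = torch.randint(0, config.vocab_size, (1, 8))
outputs = model(input_ids=input_ids, output_router_logits=True)
print(outputs.logits.shape)          # torch.Size([1, 8, 1000])
print(outputs.aux_loss)              # load-balancing loss for the MoE router
```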
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule, OptionalDependencyNotAvailable
+from ...utils import is_torch_available
+
+
+_import_structure = {
+    "configuration_dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_dbrx"] = [
+        "DbrxForCausalLM",
+        "DbrxBlock",
+        "DbrxModel",
+        "DbrxPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_dbrx import (
+            DbrxBlock,
+            DbrxForCausalLM,
+            DbrxModel,
+            DbrxPreTrainedModel,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py
new file mode 100644
index 000000000000..be75cdceb04b
--- /dev/null
+++ b/src/transformers/models/dbrx/configuration_dbrx.py
@@ -0,0 +1,279 @@
+# coding=utf-8
+# Copyright 2022 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" DBRX model configuration """
+
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    This class stores the configuration of a [`DbrxAttention`] module. It is used to instantiate attention layers
+    according to the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to None):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (float): The base frequency for rope.
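A short editorial sketch (not part of the patch) of how these attention fields interact with the parent model config. The values are illustrative; `n_heads=48` mirrors the documented `DbrxConfig` default rather than anything prescribed by this class.

```python
# Illustrative sketch of grouped-query attention sizing with DbrxAttentionConfig.
from transformers.models.dbrx.configuration_dbrx import DbrxAttentionConfig

attn_config = DbrxAttentionConfig(
    attn_pdrop=0.0,
    clip_qkv=8.0,        # q/k/v activations are clamped to [-8, 8] before attention
    kv_n_heads=8,        # number of shared key/value heads
    rope_theta=10000.0,  # RoPE base frequency
)

n_heads = 48  # query heads, taken from the documented DbrxConfig default
print(n_heads // attn_config.kv_n_heads)  # 6 query heads share each KV head
```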
+ """ + + def __init__( + self, + attn_pdrop: float = 0, + clip_qkv: Optional[float] = None, + kv_n_heads: int = 1, + rope_theta: float = 10000.0, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.attn_pdrop = attn_pdrop + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + + for k in ['model_type']: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f'Found unknown {kwargs=}') + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, + **kwargs: Any) -> 'PretrainedConfig': + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, + **kwargs) + + if config_dict.get('model_type') == 'dbrx': + config_dict = config_dict['attn_config'] + + if 'model_type' in config_dict and hasattr( + cls, + 'model_type') and config_dict['model_type'] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + + + f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxFFNConfig(PretrainedConfig): + """Configuration class for Dbrx FFN. + + [`DbrxFFN`] class. It is used to instantiate feedforward layers according to + the specified arguments, defining the layers architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. + The dict should have a key 'name' with the value being the name of + the activation function along with any additional keyword arguments. + ffn_hidden_size (int, optional): The hidden size of the feedforward network. + moe_num_experts (int, optional): The number of experts in the mixture of experts layer. + moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. + moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. + moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. + moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. + uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. + This should only be used for benchmarking purposes. 
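The routing fields above are easiest to see in isolation. Below is a standalone editorial sketch, mirroring the `DbrxRouter` defined later in `modeling_dbrx.py`, of top-k expert selection and the p-norm re-normalization controlled by `moe_normalize_expert_weights`.

```python
# Standalone sketch of the top-k routing that DbrxRouter (later in this patch) performs.
import torch

moe_num_experts, moe_top_k, moe_normalize_expert_weights = 4, 2, 1.0
hidden = torch.randn(6, 16)                                  # 6 tokens, hidden size 16
router = torch.nn.Linear(16, moe_num_experts, bias=False)

weights = router(hidden).softmax(dim=-1, dtype=torch.float32)
top_weights, top_experts = torch.topk(weights, moe_top_k, dim=-1)

# Re-normalize the selected expert weights with a p-norm, as
# moe_normalize_expert_weights=1.0 does (the top-k weights then sum to 1 per token).
top_weights = top_weights / torch.norm(
    top_weights, p=moe_normalize_expert_weights, dim=-1, keepdim=True
)
print(top_experts.shape, top_weights.sum(dim=-1))            # (6, 2), all ones
```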
+ """ + + def __init__( + self, + ffn_act_fn: Optional[dict] = None, + ffn_hidden_size: int = 3584, + moe_num_experts: int = 4, + moe_top_k: int = 1, + moe_jitter_eps: Optional[float] = None, + moe_loss_weight: float = 0.01, + moe_normalize_expert_weights: Optional[float] = 1, + uniform_expert_assignment: bool = False, + **kwargs: Any, + ): + super().__init__() + if ffn_act_fn is None: + ffn_act_fn = {'name': 'silu'} + self.ffn_act_fn = ffn_act_fn + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.moe_jitter_eps = moe_jitter_eps + self.moe_loss_weight = moe_loss_weight + self.moe_normalize_expert_weights = moe_normalize_expert_weights + self.uniform_expert_assignment = uniform_expert_assignment + + for k in ['model_type']: + if k in kwargs: + kwargs.pop(k) + if len(kwargs) != 0: + raise ValueError(f'Found unknown {kwargs=}') + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, + **kwargs: Any) -> 'PretrainedConfig': + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, + **kwargs) + + if config_dict.get('model_type') == 'dbrx': + config_dict = config_dict['ffn_config'] + + if 'model_type' in config_dict and hasattr( + cls, + 'model_type') and config_dict['model_type'] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + + + f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + ) + + return cls.from_dict(config_dict, **kwargs) + + +class DbrxConfig(PretrainedConfig): + """Configuration class for Dbrx. + + [`DbrxModel`]. It is used to instantiate a Dbrx model according to the + specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + d_model (`int`, *optional*, defaults to 6144): + Dimensionality of the embeddings and hidden states. + n_heads (`int`, *optional*, defaults to 48): + Number of attention heads for each attention layer in the Transformer encoder. + n_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer encoder. + max_seq_len (`int`, *optional*, defaults to 32768): + The maximum sequence length of the model. + vocab_size (`int`, *optional*, defaults to 100352): + Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by + the `inputs_ids` passed when calling [`DbrxModel`]. + resid_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability applied to the attention output before combining with residual. + emb_pdrop (`float`, *optional*, defaults to 0.0): + The dropout probability for the embedding layer. + attn_config (`dict`, *optional*): + A dictionary used to configure the model's attention module. + ffn_config (`dict`, *optional*): + A dictionary used to configure the model's FFN module. + use_cache (`bool`, *optional*, defaults to `False`): + Whether or not the model should return the last key/values attentions (not used by all models). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + + Example: + ```python + >>> from transformers import DbrxConfig, DbrxModel + + >>> # Initializing a Dbrx configuration + >>> configuration = DbrxConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = DbrxModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = 'dbrx' + attribute_map = { + 'num_attention_heads': 'n_heads', + 'hidden_size': 'd_model', + 'num_hidden_layers': 'n_layers', + 'max_position_embeddings': 'max_seq_len' + } + + def __init__( + self, + d_model: int = 2048, + n_heads: int = 16, + n_layers: int = 24, + max_seq_len: int = 2048, + vocab_size: int = 32000, + resid_pdrop: float = 0.0, + emb_pdrop: float = 0.0, + attn_config: Optional[DbrxAttentionConfig] = None, + ffn_config: Optional[DbrxFFNConfig] = None, + use_cache: bool = True, + initializer_range: float = 0.02, + output_router_logits: bool = False, + router_aux_loss_coef: float = 0.05, + **kwargs: Any, + ): + if attn_config is None: + self.attn_config = DbrxAttentionConfig() + elif isinstance(attn_config, dict): + self.attn_config = DbrxAttentionConfig(**attn_config) + else: + self.attn_config = attn_config + + if ffn_config is None: + self.ffn_config = DbrxFFNConfig() + elif isinstance(ffn_config, dict): + self.ffn_config = DbrxFFNConfig(**ffn_config) + else: + self.ffn_config = ffn_config + + self.d_model = d_model + self.n_heads = n_heads + self.n_layers = n_layers + self.max_seq_len = max_seq_len + self.vocab_size = vocab_size + self.resid_pdrop = resid_pdrop + self.emb_pdrop = emb_pdrop + self.use_cache = use_cache + self.initializer_range = initializer_range + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + tie_word_embeddings = kwargs.pop('tie_word_embeddings', False) + if tie_word_embeddings: + raise ValueError( + 'tie_word_embeddings is not supported for Dbrx models.') + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) \ No newline at end of file diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py new file mode 100755 index 000000000000..9a0b62dbc355 --- /dev/null +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -0,0 +1,2132 @@ +# coding=utf-8 +# Copyright 2022 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DBRX model. 
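A small editorial sketch of the `attribute_map` aliasing set up by the `DbrxConfig` defined just above, so that generic `transformers` attribute names resolve to the DBRX-specific fields; the sizes are arbitrary.

```python
# Editorial sketch: attribute_map lets generic attribute names alias DBRX fields.
from transformers import DbrxConfig

config = DbrxConfig(d_model=1024, n_heads=8, n_layers=4, max_seq_len=512)
print(config.hidden_size)              # 1024, alias of d_model
print(config.num_attention_heads)      # 8, alias of n_heads
print(config.num_hidden_layers)        # 4, alias of n_layers
print(config.max_position_embeddings)  # 512, alias of max_seq_len

# Tied input/output embeddings are rejected by this config:
# DbrxConfig(tie_word_embeddings=True) raises ValueError.
```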
""" + +import math +import warnings +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_outputs import (MoeCausalLMOutputWithPast, + MoeModelOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import is_flash_attn_2_available, logging + +from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig + +if is_flash_attn_2_available(): + try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input + except: + pass + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'DbrxConfig' + +############################################################################# +# Copied from LLaMaRotaryEmbedding +############################################################################# + + +class DbrxRotaryEmbedding(nn.Module): + + def __init__(self, + dim: int, + max_position_embeddings: int = 2048, + base: float = 10000.0, + scaling_factor: float = 1.0): + super().__init__() + self.scaling_factor = scaling_factor + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base**( + torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings + + @torch.no_grad() + def forward( + self, x: torch.Tensor, position_ids: torch.LongTensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # x: [bs, num_attention_heads, seq_len, head_size] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand( + position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance( + device_type, str) and device_type != 'mps' else 'cpu' + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() + @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + unsqueeze_dim: int = 1) -> Tuple[torch.Tensor, torch.Tensor]: + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos and + sin so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos and sin have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). + + The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, + None, :, :].expand(batch, num_key_value_heads, + n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, + head_dim) + + +############################################################################# + +############################################################################# +# Modified from modeling_mixtral +############################################################################# + + +def load_balancing_loss_func( + gate_logits: torch.Tensor, + num_experts: int, + top_k: int, + attention_mask: Optional[torch.Tensor], +) -> torch.Tensor: + r"""Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. + + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts (`int`): + Number of experts. + top_k (`int`): + The number of experts each token is routed to. + attention_mask (`torch.Tensor`, None): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + + Returns: + The auxiliary loss. 
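The helper above is the Switch Transformers auxiliary loss, loss = num_experts * sum_i f_i * P_i, where f_i is the fraction of tokens routed to expert i and P_i is the mean router probability assigned to it. A tiny editorial sketch of the unmasked case, reproducing the computation outside the model:

```python
# Tiny numerical sketch of the auxiliary loss above (no attention_mask case).
import torch

num_experts, top_k = 4, 1
# One layer's gate logits for 8 tokens; two tokens strongly prefer each expert.
gate_logits = (torch.eye(num_experts).repeat(2, 1) * 10.0,)

probs = torch.softmax(torch.cat(gate_logits), dim=-1)
_, selected = torch.topk(probs, top_k, dim=-1)
expert_mask = torch.nn.functional.one_hot(selected, num_experts)

tokens_per_expert = expert_mask.float().mean(dim=0)      # f_i: fraction of tokens per expert
router_prob_per_expert = probs.mean(dim=0)               # P_i: mean router probability per expert
aux_loss = num_experts * torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
print(aux_loss)                                          # ~1.0 for perfectly balanced routing
```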
+ """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return torch.tensor(0.0) + + if isinstance(gate_logits, tuple): + compute_device = gate_logits[0].device + concatenated_gate_logits = torch.cat( + [layer_gate.to(compute_device) for layer_gate in gate_logits], + dim=0) + + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, + dim=-1) + + _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.mean(routing_weights, dim=0) + else: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // ( + batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = (attention_mask[None, :, :, None, None].expand( + (num_hidden_layers, batch_size, sequence_length, top_k, + num_experts)).reshape(-1, top_k, num_experts).to(compute_device)) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = torch.sum( + expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( + expert_attention_mask, dim=0) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None].expand( + (num_hidden_layers, batch_size, sequence_length, + num_experts)).reshape(-1, num_experts).to(compute_device)) + + # Compute the average probability of routing to these experts + router_prob_per_expert = torch.sum( + routing_weights * router_per_expert_attention_mask, + dim=0) / torch.sum(router_per_expert_attention_mask, dim=0) + + overall_loss = torch.sum(tokens_per_expert * + router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +############################################################################# + + +def resolve_ffn_act_fn( + ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]: + """Resolve the activation function for the feed-forward network. + + Args: + ffn_act_fn (dict): The configuration dictionary for the activation function. + The dict config must specify the 'name' of a torch.nn.functional activation + function. All of other key values pairs are bound to the function as a partial. + + Returns: + Callable[[torch.Tensor], torch.Tensor]: The activation function. 
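A brief editorial sketch of what `resolve_ffn_act_fn` returns: the `'name'` key picks a `torch.nn.functional` activation and any remaining keys are bound with `functools.partial`. The `gelu`/`approximate` pairing is an illustrative example, not a DBRX default.

```python
# Sketch of the dict-to-callable mapping performed by resolve_ffn_act_fn.
from copy import deepcopy
from functools import partial

import torch
import torch.nn.functional as F


def resolve(ffn_act_fn: dict):
    config = deepcopy(ffn_act_fn)
    name = config.pop("name")
    return partial(getattr(F, name), **config)


silu = resolve({"name": "silu"})                              # DbrxFFNConfig default
gelu_tanh = resolve({"name": "gelu", "approximate": "tanh"})  # extra kwargs are bound
x = torch.randn(3)
print(torch.allclose(silu(x), F.silu(x)))                     # True
print(gelu_tanh(x))
```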
+ """ + config = deepcopy(ffn_act_fn) + name = config.pop('name') + if not hasattr(nn.functional, name): + raise ValueError(f'Unrecognised activation function name ({name}).') + act = getattr(nn.functional, name) + return partial(act, **config) + + +############################################################################# +# Copied from LLaMaAttention +############################################################################# + + +def _get_unpad_data(attention_mask: torch.Tensor): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), + (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +class DbrxAttention(nn.Module): + """Multi-head self attention.""" + + def __init__(self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + attn_config: DbrxAttentionConfig, + block_idx: Optional[int] = None): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = max_position_embeddings + self.block_idx = block_idx + self.config = attn_config + if block_idx is None: + logger.warning_once( + f'Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will ' + + + 'lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` ' + + 'when creating this class.') + + self.attn_pdrop = attn_config.attn_pdrop + self.clip_qkv = attn_config.clip_qkv + self.num_key_value_heads = attn_config.kv_n_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.rope_theta = attn_config.rope_theta + + self.Wqkv = nn.Linear(self.hidden_size, + self.hidden_size + + 2 * self.num_key_value_heads * self.head_dim, + bias=False) + self.out_proj = nn.Linear(self.hidden_size, + self.hidden_size, + bias=False) + self.rotary_emb = DbrxRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = qkv_states.split( + [ + self.hidden_size, + self.num_key_value_heads * self.head_dim, + self.num_key_value_heads * self.head_dim, + ], + dim=2, + ) + + query_states = query_states.view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + + past_key_value = getattr(self, 'past_key_value', past_key_value) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, + key_states, cos, sin) + + if past_key_value is not None: + # sin and 
cos are specific to RoPE models; position_ids needed for the static cache + cache_kwargs = { + 'sin': sin, + 'cos': cos, + 'cache_position': cache_position + } + key_states, value_states = past_key_value.update( + key_states, value_states, self.block_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose( + 2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, + dim=-1, + dtype=torch.float32).to( + query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, + p=self.attn_pdrop, + training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' + + f' {attn_output.size()}') + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class DbrxFlashAttention2(DbrxAttention): + """Dbrx flash attention module. + + This module inherits from `DbrxAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it + calls the public API of flash attention. + """ + + def __init__(self, *args: Any, **kwargs: Any): + if not is_flash_attn_2_available(): + raise ImportError( + 'Flash Attention 2 is not available. Please install it with `pip install flash-attn`.' + ) + + super().__init__(*args, **kwargs) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + logger.info( + 'Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.' 
+ ) + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = qkv_states.split( + [ + self.hidden_size, + self.num_key_value_heads * self.head_dim, + self.num_key_value_heads * self.head_dim, + ], + dim=2, + ) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, + self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, + key_states, cos, sin) + + past_key_value = getattr(self, 'past_key_value', past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + 'sin': sin, + 'cos': cos, + 'cache_position': cache_position + } + key_states, value_states = past_key_value.update( + key_states, value_states, self.block_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attn_pdrop if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, '_pre_quantization_dtype'): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = query_states.dtype + + logger.warning_once( + f'The input hidden states seems to be silently casted in float32, this might be ' + + + f'related to the fact you have upcasted embedding or layer norm layers in ' + + f'float32. 
We will cast back the input in {target_dtype}.') + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + ) + + attn_output = attn_output.reshape(bsz, q_len, + self.hidden_size).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value # type: ignore + + def _flash_attention_forward( + self, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Union[torch.LongTensor, None], + query_length: int, + dropout: float = 0.0, + softmax_scale: Optional[float] = None, + ): + """Use FlashAttention, stripping padding tokens if necessary. + + Args: + query_states (torch.Tensor): Input query states to be passed to Flash Attention API + key_states (torch.Tensor): Input key states to be passed to Flash Attention API + value_states (torch.Tensor): Input value states to be passed to Flash Attention API + attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size + (batch_size, seq_len) where 0 stands for the position of padding tokens and 1 + for the position of non-padding tokens. + query_length (int): The length of the query sequence + dropout (float): Attention dropout + softmax_scale (float, optional): The scaling of QK^T before applying softmax. + Defaults to 1 / sqrt(head_dim) + """ + causal = True + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, + query_length) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, + indices_q, + batch_size, + query_length, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, + value_layer: torch.Tensor, attention_mask: torch.Tensor, + query_length: int): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( + attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, + head_dim), indices_k) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, + head_dim), indices_k) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, + head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, 
device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +DBRX_ATTENTION_CLASSES = { + 'eager': DbrxAttention, + 'flash_attention_2': DbrxFlashAttention2, +} + + +class DbrxNormAttentionNorm(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + resid_pdrop: float, + attn_implementation: str, + attn_config: DbrxAttentionConfig, + block_idx: Optional[int] = None, + ): + super().__init__() + self.block_idx = block_idx + self.resid_pdrop = resid_pdrop + self.norm_1 = nn.LayerNorm(hidden_size, bias=False) + self.attn = DBRX_ATTENTION_CLASSES[attn_implementation]( + hidden_size=hidden_size, + num_heads=num_heads, + max_position_embeddings=max_position_embeddings, + attn_config=attn_config, + block_idx=block_idx, + ) + self.norm_2 = nn.LayerNorm(hidden_size, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[Cache]]: + + residual_states = hidden_states + hidden_states = self.norm_1(hidden_states).to(hidden_states.dtype) + + hidden_states, attn_weights, past_key_value = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = nn.functional.dropout(hidden_states, + p=self.resid_pdrop, + training=self.training) + hidden_states = hidden_states + residual_states + + residual_states = hidden_states + hidden_states = self.norm_2(hidden_states).to(hidden_states.dtype) + + return residual_states, hidden_states, attn_weights, past_key_value + + +class DbrxRouter(nn.Module): + + def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, + moe_jitter_eps: Optional[float], + moe_normalize_expert_weights: Optional[float], + uniform_expert_assignment: bool): + super().__init__() + self.hidden_size = hidden_size + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.moe_jitter_eps = moe_jitter_eps + self.moe_normalize_expert_weights = moe_normalize_expert_weights + self.uniform_expert_assignment = uniform_expert_assignment + + self.layer = nn.Linear(self.hidden_size, + self.moe_num_experts, + bias=False) + + def jitter(self, x: torch.Tensor) -> torch.Tensor: + if self.moe_jitter_eps is None: + raise RuntimeError('The router does not have moe_jitter_eps set.') + low = 1.0 - self.moe_jitter_eps + high = 1.0 + self.moe_jitter_eps + noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) + return low + noise * (high - low) + + def forward( + self, x: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + if self.training and self.moe_jitter_eps is not None: + x = x * self.jitter(x) + + weights = 
self.layer(x.view(-1, + x.shape[-1])).softmax(dim=-1, + dtype=torch.float32) + top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) + + if self.moe_normalize_expert_weights: + top_weights = top_weights / torch.norm( + top_weights, + p=self.moe_normalize_expert_weights, + dim=-1, + keepdim=True) + + if self.uniform_expert_assignment: + with torch.no_grad(): + uniform_tensor = torch.arange( + 0, + top_experts.numel(), + device=top_experts.device, + dtype=top_experts.dtype) % self.moe_num_experts + top_experts = uniform_tensor.reshape(top_experts.shape) + # Note, weights and top_weights are not changed + + weights = weights.to(x.dtype) + top_weights = top_weights.to(x.dtype) + return weights, top_weights, top_experts # type: ignore + + +class DbrxExpertGLU(nn.Module): + + def __init__(self, hidden_size: int, ffn_hidden_size: int, + moe_num_experts: int, ffn_act_fn: dict): + super().__init__() + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + + self.w1 = nn.Parameter( + torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.v1 = nn.Parameter( + torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w2 = nn.Parameter( + torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.activation_fn = resolve_ffn_act_fn(ffn_act_fn) + + def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: + expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, + self.hidden_size)[expert_idx] + expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, + self.hidden_size)[expert_idx] + expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, + self.hidden_size)[expert_idx] + + x1 = x.matmul(expert_w1.t()) + x2 = x.matmul(expert_v1.t()) + x1 = self.activation_fn(x1) + x1 = x1 * x2 + x1 = x1.matmul(expert_w2) + return x1 + + +class DbrxExperts(nn.Module): + + def __init__(self, hidden_size: int, ffn_hidden_size: int, + moe_num_experts: int, ffn_act_fn: dict): + super().__init__() + self.moe_num_experts = moe_num_experts + self.mlp = DbrxExpertGLU(hidden_size=hidden_size, + ffn_hidden_size=ffn_hidden_size, + moe_num_experts=moe_num_experts, + ffn_act_fn=ffn_act_fn) + + def forward(self, x: torch.Tensor, weights: torch.Tensor, + top_weights: torch.Tensor, + top_experts: torch.LongTensor) -> torch.Tensor: + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = nn.functional.one_hot( + top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + if token_idx.shape[0] == 0: + continue + + token_list = token_idx.tolist() + topk_list = topk_idx.tolist() + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = self.mlp( + expert_tokens, expert_idx) * top_weights[token_list, topk_list, + None] + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +class DbrxFFN(nn.Module): + + def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): + super().__init__() + + self.router = DbrxRouter( + hidden_size, + moe_num_experts=ffn_config.moe_num_experts, + moe_top_k=ffn_config.moe_top_k, + moe_jitter_eps=ffn_config.moe_jitter_eps, + moe_normalize_expert_weights=ffn_config. 
+ moe_normalize_expert_weights, + uniform_expert_assignment=ffn_config.uniform_expert_assignment, + ) + + self.experts = DbrxExperts( + hidden_size=hidden_size, + ffn_hidden_size=ffn_config.ffn_hidden_size, + moe_num_experts=ffn_config.moe_num_experts, + ffn_act_fn=ffn_config.ffn_act_fn, + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + weights, top_weights, top_experts = self.router(x) + out = self.experts(x, weights, top_weights, top_experts) + return out, weights + + +class DbrxBlock(nn.Module): + + def __init__(self, config: DbrxConfig, block_idx: int): + super().__init__() + self.hidden_size = config.d_model + self.resid_pdrop = config.resid_pdrop + self.block_idx = block_idx + self.norm_attn_norm = DbrxNormAttentionNorm( + hidden_size=config.d_model, + num_heads=config.n_heads, + max_position_embeddings=config.max_seq_len, + resid_pdrop=config.resid_pdrop, + attn_implementation=config._attn_implementation, + attn_config=config.attn_config, + block_idx=block_idx, + ) + self.ffn = DbrxFFN(hidden_size=config.d_model, + ffn_config=config.ffn_config) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Any, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[Cache]], Tuple[ + torch.Tensor, Optional[torch.Tensor], Optional[Cache]], + Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[torch.Tensor]], Tuple[ + torch.Tensor, Optional[Cache], Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], + Optional[torch.Tensor]],]: + """Forward function for DbrxBlock. + + Args: + hidden_states (`torch.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + position_ids (`torch.LongTensor`): position ids of shape `(batch, seq_len)` + attention_mask (`torch.Tensor`, optional): attention mask of size (batch_size, sequence_length) + if flash attention is used or (batch_size, 1, query_sequence_length, key_sequence_length) + if default attention is used. + past_key_value (`Tuple(torch.Tensor)`, optional): cached past key and value projection states + output_attentions (`bool`, optional): Whether or not to return the attentions tensors of all + attention layers. See `attentions` under returned tensors for more detail. + output_router_logits (`bool`, optional): Whether or not to return the router logits. + use_cache (`bool`, optional): If set to `True`, `past_key_values` key value states are + returned and can be used to speed up decoding (see `past_key_values`). + cache_position (`torch.LongTensor`, optional): position ids of the cache + """ + if 'padding_mask' in kwargs: + warnings.warn( + 'Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`' + ) + + # Norm + Attention + Norm + resid_states, hidden_states, self_attn_weights, present_key_value = self.norm_attn_norm( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + + # Fully Connected + hidden_states, router_logits = self.ffn(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, + p=self.resid_pdrop, + training=self.training) + hidden_states = resid_states + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +class DbrxPreTrainedModel(PreTrainedModel): + config_class = DbrxConfig + base_model_prefix = 'transformer' + supports_gradient_checkpointing = True + _no_split_modules = ['DbrxBlock'] + _skip_keys_device_placement = ['past_key_values'] + _supports_flash_attn_2 = True + _supports_sdpa = False + _supports_cache_class = True + + def _init_weights(self, module: nn.Module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, DbrxExpertGLU): + module.w1.data.normal_(mean=0.0, std=std) + module.v1.data.normal_(mean=0.0, std=std) + module.w2.data.normal_(mean=0.0, std=std) + + def _setup_cache(self, cache_cls: Any, max_batch_size: int, + max_cache_len: int): # TODO: how to set var type of class? + if self.config._attn_implementation == 'flash_attention_2' and cache_cls == StaticCache: + raise ValueError( + '`static` cache implementation is not compatible with ' + + '`attn_implementation==flash_attention_2`. Make sure to use ' + + '`spda` in the mean time and open an issue at https://github.com/huggingface/transformers.' + ) + + for block in self.transformer.blocks: + device = block.norm_attn_norm.norm_1.weight.device + if hasattr(self.config, '_pre_quantization_dtype'): + dtype = self.config._pre_quantization_dtype + else: + dtype = block.norm_attn_norm.attn.out_proj.weight.dtype + block.norm_attn_norm.attn.past_key_value = cache_cls(self.config, + max_batch_size, + max_cache_len, + device=device, + dtype=dtype) + + def _reset_cache(self): + for block in self.transformer.blocks: + block.norm_attn_norm.attn.past_key_value = None + + +class DbrxModel(DbrxPreTrainedModel): + """Transformer decoder consisting of *config.num_hidden_layers* + + [`DbrxBlock`] layers. 
+ + Args: + config: DbrxConfig + """ + + def __init__(self, config: DbrxConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.emb_pdrop = config.emb_pdrop + + self.wte = nn.Embedding(config.vocab_size, config.d_model, + self.padding_idx) + self.blocks = nn.ModuleList([ + DbrxBlock(config, block_idx) for block_idx in range(config.n_layers) + ]) + self.norm_f = nn.LayerNorm(config.d_model, bias=False) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Embedding: + return self.wte + + def set_input_embeddings(self, value: nn.Embedding): + self.wte = value + + def _autocast_input_embeddings(self, + inputs_embeds: torch.Tensor) -> torch.Tensor: + if inputs_embeds.device.type == 'cuda' and torch.is_autocast_enabled(): + return inputs_embeds.to(dtype=torch.get_autocast_gpu_dtype()) + elif inputs_embeds.device.type == 'cpu' and torch.is_autocast_cpu_enabled( + ): + return inputs_embeds.to(dtype=torch.get_autocast_cpu_dtype()) + else: + return inputs_embeds + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, MoeModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + output_router_logits = (output_router_logits + if output_router_logits is not None else + self.config.output_router_logits) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + inputs_embeds = self._autocast_input_embeddings( + inputs_embeds) # type: ignore + inputs_embeds = nn.functional.dropout(inputs_embeds, + p=self.emb_pdrop, + training=self.training) + + past_seen_tokens = 0 + if use_cache: # kept for BC (cache positions) + if not isinstance(past_key_values, StaticCache): + past_key_values = DynamicCache.from_legacy_cache( + past_key_values) + past_seen_tokens = past_key_values.get_seq_length( # type: ignore + ) + + if cache_position is None: + if isinstance(past_key_values, StaticCache): + raise ValueError( + 'cache_position is a required argument when using StaticCache.' 
+ ) + cache_position = torch.arange( # type: ignore + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) # type: ignore + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, + cache_position) # type: ignore + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = None + + for block in self.blocks: + if output_hidden_states: + all_hidden_states += (hidden_states,) # type: ignore + + if self.gradient_checkpointing and self.training: + block_outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + ) + else: + block_outputs = block( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = block_outputs[0] + + if use_cache: + next_decoder_cache = block_outputs[ + 2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (block_outputs[1],) # type: ignore + + if output_router_logits: + all_router_logits += (block_outputs[-1],) # type: ignore + + hidden_states = self.norm_f(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) # type: ignore + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() # type: ignore + if isinstance(next_decoder_cache, Cache) else + next_decoder_cache) + if not return_dict: + return tuple(v for v in [ + hidden_states, next_cache, all_hidden_states, all_self_attns, + all_router_logits + ] if v is not None) + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 + def _update_causal_mask( + self, attention_mask: Optional[torch.Tensor], + input_tensor: torch.Tensor, + cache_position: torch.Tensor) -> Optional[torch.Tensor]: + if self.config._attn_implementation == 'flash_attention_2': + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if hasattr(self.blocks[0].norm_attn_norm.attn, + 'past_key_value'): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = (attention_mask.shape[-1] if isinstance( + attention_mask, torch.Tensor) else cache_position[-1] + 1) + target_length = int(target_length) + + causal_mask = torch.full((sequence_length, target_length), + fill_value=min_dtype, + dtype=dtype, + device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange( + target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, + None, :, :].expand(input_tensor.shape[0], 1, + -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone( + ) # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq( + 0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[ + ..., :mask_length].masked_fill(padding_mask, min_dtype) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[ + -2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to( + dtype=dtype) * min_dtype + causal_mask[:mask_shape[0], :mask_shape[1], + offset:mask_shape[2] + + offset, :mask_shape[3]] = mask_slice + + if (self.config._attn_implementation == 'sdpa' and + attention_mask is not None and + attention_mask.device.type == 'cuda'): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() or + isinstance(input_tensor, torch.fx.Proxy) or # type: ignore + (hasattr(torch, '_dynamo') and torch._dynamo.is_compiling())) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended( + causal_mask, min_dtype) + + return causal_mask + + +class DbrxForCausalLM(DbrxPreTrainedModel): + + def __init__(self, config: DbrxConfig): + super().__init__(config) + self.transformer = DbrxModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, + config.vocab_size, + bias=False) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.ffn_config.moe_num_experts + self.num_experts_per_tok = config.ffn_config.moe_top_k + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Embedding: + return self.transformer.get_input_embeddings() + + def set_input_embeddings(self, value: nn.Embedding): + self.transformer.set_input_embeddings(value) + + def get_output_embeddings(self) -> nn.Linear: + return self.lm_head + + def set_output_embeddings(self, new_embeddings: nn.Linear): + self.lm_head = new_embeddings + + def set_decoder(self, decoder: DbrxModel): + self.transformer = decoder + + def get_decoder(self) -> DbrxModel: + return self.transformer + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, MoeCausalLMOutputWithPast]: + r"""Forward function for causal language modeling. + + Example: + ```python + >>> from transformers import AutoTokenizer, DbrxForCausalLM + + >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx") + >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = (output_hidden_states + if output_hidden_states is not None else + self.config.output_hidden_states) + output_router_logits = (output_router_logits + if output_router_logits is not None else + self.config.output_router_logits) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None and loss is not None: + loss += self.router_aux_loss_coef * aux_loss.to( + loss.device) # make sure to reside in the same device + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return MoeCausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: Any) -> Dict[str, Any]: + past_length = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[ + 1] > input_ids.shape[1]: + input_ids = input_ids[:, + -(attention_mask.shape[1] - past_length):] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. 
+ + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if (max_cache_length is not None and attention_mask is not None and + cache_length + input_ids.shape[1] > max_cache_length): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get('position_ids', None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1]:] + + if self.generation_config.cache_implementation == 'static': + # generation with static cache + cache_position = kwargs.get('cache_position', None) + if cache_position is None: + past_length = 0 + else: + past_length = cache_position[-1] + 1 + input_ids = input_ids[:, past_length:] + position_ids = position_ids[:, + past_length:] if position_ids is not None else None + + # TODO @gante we should only keep a `cache_position` in generate, and do +=1. + # same goes for position ids. Could also help with continued generation. + input_length = position_ids.shape[ + -1] if position_ids is not None else input_ids.shape[-1] + cache_position = torch.arange(past_length, + past_length + input_length, + device=input_ids.device) + position_ids = position_ids.contiguous( + ) if position_ids is not None else None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {'input_ids': input_ids.contiguous()} + + model_inputs.update( + { # type: ignore + 'position_ids': position_ids, + 'cache_position': cache_position, + 'past_key_values': past_key_values, + 'use_cache': kwargs.get('use_cache'), + 'attention_mask': attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values: Cache, beam_idx: torch.LongTensor): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past),) + return reordered_past + + + +# @add_start_docstrings("""DBRX Model with a `language modeling` head on top. """, DBRX_START_DOCSTRING) +# class DbrxForMaskedLM(DbrxPreTrainedModel): +# def __init__(self, config): +# super().__init__(config) + +# if config.is_decoder: +# logger.warning( +# "If you want to use `DbrxForMaskedLM` make sure `config.is_decoder=False` for " +# "bi-directional self-attention." 
+# ) + +# self.dbrx = DbrxModel(config) +# self.cls = DbrxOnlyMLMHead(config) + +# # Initialize weights and apply final processing +# self.post_init() + +# def get_output_embeddings(self): +# return self.cls.predictions.decoder + +# def set_output_embeddings(self, new_embeddings): +# self.cls.predictions.decoder = new_embeddings + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) +# @add_code_sample_docstrings( +# checkpoint=_CHECKPOINT_FOR_DOC, +# output_type=MaskedLMOutput, +# config_class=_CONFIG_FOR_DOC, +# ) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# head_mask=None, +# inputs_embeds=None, +# encoder_hidden_states=None, +# encoder_attention_mask=None, +# labels=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +# Labels for computing the masked language modeling loss. +# Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) +# Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels +# in `[0, ..., config.vocab_size]`. +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# encoder_hidden_states=encoder_hidden_states, +# encoder_attention_mask=encoder_attention_mask, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] +# prediction_scores = self.cls(sequence_output) + +# masked_lm_loss = None +# if labels is not None: +# loss_fct = CrossEntropyLoss() # -100 index = padding token +# masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + +# if not return_dict: +# output = (prediction_scores,) + outputs[1:] +# return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + +# return MaskedLMOutput( +# loss=masked_lm_loss, +# logits=prediction_scores, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# ) + +# def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): +# input_shape = input_ids.shape +# effective_batch_size = input_shape[0] + +# # add a dummy token +# assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" +# attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) +# dummy_token = torch.full( +# (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device +# ) +# input_ids = torch.cat([input_ids, dummy_token], dim=1) + +# return {"input_ids": input_ids, "attention_mask": attention_mask} + + +# @add_start_docstrings( +# """DBRX Model with a `language modeling` head on top for CLM fine-tuning. 
""", DBRX_START_DOCSTRING +# ) +# class DbrxForCausalLM(DbrxPreTrainedModel): + +# _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + +# def __init__(self, config): +# super().__init__(config) + +# if not config.is_decoder: +# logger.warning("If you want to use `DbrxForCausalLM` as a standalone, add `is_decoder=True.`") + +# self.dbrx = DbrxModel(config) +# self.cls = DbrxOnlyMLMHead(config) + +# # Initialize weights and apply final processing +# self.post_init() + +# def get_output_embeddings(self): +# return self.cls.predictions.decoder + +# def set_output_embeddings(self, new_embeddings): +# self.cls.predictions.decoder = new_embeddings + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) +# @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# inputs_embeds=None, +# encoder_hidden_states=None, +# encoder_attention_mask=None, +# head_mask=None, +# cross_attn_head_mask=None, +# past_key_values=None, +# labels=None, +# use_cache=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): +# Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if +# the model is configured as a decoder. +# encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): +# Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in +# the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + +# - 1 for tokens that are **not masked**, +# - 0 for tokens that are **masked**. +# past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): +# Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 +# tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional +# tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two +# additional tensors are only required when the model is used as a decoder in a Sequence to Sequence +# model. + +# Contains pre-computed hidden-states (key and values in the self-attention blocks and in the +# cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential +# decoding. + +# If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` +# (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` +# instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. +# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +# Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in +# `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are +# ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. 
+# use_cache (`bool`, *optional*): +# If set to `True`, `past_key_values` key value states are returned and can be used to speed up +# decoding (see `past_key_values`). + +# Returns: + +# Example: + +# ```python +# >>> from transformers import DbrxTokenizer, DbrxForCausalLM, DbrxConfig +# >>> import torch + +# >>> tokenizer = DbrxTokenizer.from_pretrained('databricks/dbrx-instruct') +# >>> config = DbrxConfig.from_pretrained("databricks/dbrx-instruct") +# >>> config.is_decoder = True +# >>> model = DbrxForCausalLM.from_pretrained('databricks/dbrx-instruct', config=config) + +# >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") +# >>> outputs = model(**inputs) + +# >>> prediction_logits = outputs.logits +# ``` +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# encoder_hidden_states=encoder_hidden_states, +# encoder_attention_mask=encoder_attention_mask, +# past_key_values=past_key_values, +# use_cache=use_cache, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] +# prediction_scores = self.cls(sequence_output) + +# lm_loss = None +# if labels is not None: +# # we are doing next-token prediction; shift prediction scores and input ids by one +# shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() +# labels = labels[:, 1:].contiguous() +# loss_fct = CrossEntropyLoss() +# lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + +# if not return_dict: +# output = (prediction_scores,) + outputs[1:] +# return ((lm_loss,) + output) if lm_loss is not None else output + +# return CausalLMOutputWithCrossAttentions( +# loss=lm_loss, +# logits=prediction_scores, +# past_key_values=outputs.past_key_values, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# cross_attentions=outputs.cross_attentions, +# ) + +# def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): +# input_shape = input_ids.shape + +# # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly +# if attention_mask is None: +# attention_mask = input_ids.new_ones(input_shape) + +# # cut decoder_input_ids if past is used +# if past_key_values is not None: +# input_ids = input_ids[:, -1:] + +# return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + +# def _reorder_cache(self, past_key_values, beam_idx): +# reordered_past = () +# for layer_past in past_key_values: +# reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + layer_past[2:],) +# return reordered_past + +# class DbrxClassificationHead(nn.Module): +# """Head for sentence-level classification tasks.""" + +# def __init__(self, config): +# super().__init__() +# self.dense = nn.Linear(config.hidden_size, config.hidden_size) +# self.dropout = nn.Dropout(config.hidden_dropout_prob) +# self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + +# self.config = config + +# def forward(self, features, **kwargs): +# x = features[:, 0, :] # take token (equiv. 
to [CLS]) +# x = self.dropout(x) +# x = self.dense(x) +# x = ACT2FN[self.config.hidden_act](x) +# x = self.dropout(x) +# x = self.out_proj(x) +# return x + + +# @add_start_docstrings( +# """DBRX Model transformer with a sequence classification/regression head on top (a linear layer on top of +# the pooled output) e.g. for GLUE tasks. """, +# DBRX_START_DOCSTRING, +# ) +# class DbrxForSequenceClassification(DbrxPreTrainedModel): +# def __init__(self, config): +# super().__init__(config) +# self.num_labels = config.num_labels +# self.dbrx = DbrxModel(config) +# self.classifier = DbrxClassificationHead(config) + +# # Initialize weights and apply final processing +# self.post_init() + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) +# @add_code_sample_docstrings( +# checkpoint=_CHECKPOINT_FOR_DOC, +# output_type=SequenceClassifierOutput, +# config_class=_CONFIG_FOR_DOC, +# ) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# head_mask=None, +# inputs_embeds=None, +# labels=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): +# Labels for computing the sequence classification/regression loss. +# Indices should be in `[0, ..., config.num_labels - 1]`. +# If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), +# If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] +# logits = self.classifier(sequence_output) + +# loss = None +# if labels is not None: +# if self.config.problem_type is None: +# if self.num_labels == 1: +# self.config.problem_type = "regression" +# elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): +# self.config.problem_type = "single_label_classification" +# else: +# self.config.problem_type = "multi_label_classification" + +# if self.config.problem_type == "regression": +# loss_fct = MSELoss() +# if self.num_labels == 1: +# loss = loss_fct(logits.squeeze(), labels.squeeze()) +# else: +# loss = loss_fct(logits, labels) +# elif self.config.problem_type == "single_label_classification": +# loss_fct = CrossEntropyLoss() +# loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) +# elif self.config.problem_type == "multi_label_classification": +# loss_fct = BCEWithLogitsLoss() +# loss = loss_fct(logits, labels) +# if not return_dict: +# output = (logits,) + outputs[1:] +# return ((loss,) + output) if loss is not None else output + +# return SequenceClassifierOutput( +# loss=loss, +# logits=logits, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# ) + +# @add_start_docstrings( +# """DBRX Model with a multiple choice classification head on top (a linear layer on top of +# the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", +# DBRX_START_DOCSTRING, +# ) +# class DbrxForMultipleChoice(DbrxPreTrainedModel): +# def __init__(self, config): +# super().__init__(config) + +# self.dbrx = DbrxModel(config) +# self.sequence_summary = SequenceSummary(config) +# self.classifier = nn.Linear(config.hidden_size, 1) + +# # Initialize weights and apply final processing +# self.post_init() + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) +# @add_code_sample_docstrings( +# checkpoint=_CHECKPOINT_FOR_DOC, +# output_type=MultipleChoiceModelOutput, +# config_class=_CONFIG_FOR_DOC, +# ) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# head_mask=None, +# inputs_embeds=None, +# labels=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): +# Labels for computing the multiple choice classification loss. +# Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension +# of the input tensors. (See `input_ids` above) +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict +# num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + +# input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None +# attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None +# token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None +# position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None +# inputs_embeds = ( +# inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) +# if inputs_embeds is not None +# else None +# ) + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] + +# pooled_output = self.sequence_summary(sequence_output) +# logits = self.classifier(pooled_output) +# reshaped_logits = logits.view(-1, num_choices) + +# loss = None +# if labels is not None: +# loss_fct = CrossEntropyLoss() +# loss = loss_fct(reshaped_logits, labels) + +# if not return_dict: +# output = (reshaped_logits,) + outputs[1:] +# return ((loss,) + output) if loss is not None else output + +# return MultipleChoiceModelOutput( +# loss=loss, +# logits=reshaped_logits, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# ) + + +# @add_start_docstrings( +# """DBRX Model with a token classification head on top (a linear layer on top of +# the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", +# DBRX_START_DOCSTRING, +# ) +# class DbrxForTokenClassification(DbrxPreTrainedModel): +# def __init__(self, config): +# super().__init__(config) +# self.num_labels = config.num_labels + +# self.dbrx = DbrxModel(config) +# self.dropout = nn.Dropout(config.hidden_dropout_prob) +# self.classifier = nn.Linear(config.hidden_size, config.num_labels) + +# # Initialize weights and apply final processing +# self.post_init() + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) +# @add_code_sample_docstrings( +# checkpoint=_CHECKPOINT_FOR_DOC, +# output_type=TokenClassifierOutput, +# config_class=_CONFIG_FOR_DOC, +# ) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# head_mask=None, +# inputs_embeds=None, +# labels=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): +# Labels for computing the token classification loss. +# Indices should be in `[0, ..., config.num_labels - 1]`. +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] + +# sequence_output = self.dropout(sequence_output) +# logits = self.classifier(sequence_output) + +# loss = None +# if labels is not None: +# loss_fct = CrossEntropyLoss() +# loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + +# if not return_dict: +# output = (logits,) + outputs[1:] +# return ((loss,) + output) if loss is not None else output + +# return TokenClassifierOutput( +# loss=loss, +# logits=logits, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# ) + + +# @add_start_docstrings( +# """DBRX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear +# layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, +# DBRX_START_DOCSTRING, +# ) +# class DbrxForQuestionAnswering(DbrxPreTrainedModel): +# def __init__(self, config): +# super().__init__(config) + +# config.num_labels = 2 +# self.num_labels = config.num_labels + +# self.dbrx = DbrxModel(config) +# self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + +# # Initialize weights and apply final processing +# self.post_init() + +# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) +# @add_code_sample_docstrings( +# checkpoint=_CHECKPOINT_FOR_DOC, +# output_type=QuestionAnsweringModelOutput, +# config_class=_CONFIG_FOR_DOC, +# ) +# def forward( +# self, +# input_ids=None, +# attention_mask=None, +# token_type_ids=None, +# position_ids=None, +# head_mask=None, +# inputs_embeds=None, +# start_positions=None, +# end_positions=None, +# output_attentions=None, +# output_hidden_states=None, +# return_dict=None, +# ): +# r""" +# start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): +# Labels for position (index) of the start of the labelled span for computing the token classification loss. +# Positions are clamped to the length of the sequence (`sequence_length`). 
+# Position outside of the sequence are not taken into account for computing the loss. +# end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): +# Labels for position (index) of the end of the labelled span for computing the token classification loss. +# Positions are clamped to the length of the sequence (`sequence_length`). +# Position outside of the sequence are not taken into account for computing the loss. +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# outputs = self.dbrx( +# input_ids, +# attention_mask=attention_mask, +# token_type_ids=token_type_ids, +# position_ids=position_ids, +# head_mask=head_mask, +# inputs_embeds=inputs_embeds, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) + +# sequence_output = outputs[0] + +# logits = self.qa_outputs(sequence_output) +# start_logits, end_logits = logits.split(1, dim=-1) +# start_logits = start_logits.squeeze(-1) +# end_logits = end_logits.squeeze(-1) + +# total_loss = None +# if start_positions is not None and end_positions is not None: +# # If we are on multi-GPU, split add a dimension +# if len(start_positions.size()) > 1: +# start_positions = start_positions.squeeze(-1) +# if len(end_positions.size()) > 1: +# end_positions = end_positions.squeeze(-1) +# # sometimes the start/end positions are outside our model inputs, we ignore these terms +# ignored_index = start_logits.size(1) +# start_positions = start_positions.clamp(0, ignored_index) +# end_positions = end_positions.clamp(0, ignored_index) + +# loss_fct = CrossEntropyLoss(ignore_index=ignored_index) +# start_loss = loss_fct(start_logits, start_positions) +# end_loss = loss_fct(end_logits, end_positions) +# total_loss = (start_loss + end_loss) / 2 + +# if not return_dict: +# output = (start_logits, end_logits) + outputs[1:] +# return ((total_loss,) + output) if total_loss is not None else output + +# return QuestionAnsweringModelOutput( +# loss=total_loss, +# start_logits=start_logits, +# end_logits=end_logits, +# hidden_states=outputs.hidden_states, +# attentions=outputs.attentions, +# ) diff --git a/tests/models/dbrx/__init__.py b/tests/models/dbrx/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py new file mode 100644 index 000000000000..3f841f74cc21 --- /dev/null +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -0,0 +1,468 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch DBRX model. 
""" + + +import unittest + +from ...test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from transformers import DbrxConfig +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + DbrxForCausalLM, + DbrxModel, + ) + + +class DbrxModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return DbrxConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = 
floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DbrxModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = DbrxModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = DbrxForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DbrxForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = DbrxForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + 
next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DbrxForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DbrxForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DbrxForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = DbrxForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = 
self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class DbrxModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DbrxModel, + DbrxForCausalLM, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (DbrxForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = DbrxModelTester(self) + self.config_tester = ConfigTester(self, config_class=DbrxConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + model_name = "databricks/dbrx-instruct" + model = DbrxModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class DbrxModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = DbrxForMaskedLM.from_pretrained("databricks/dbrx-instruct") + input_ids 
= torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # TODO Replace vocab size + vocab_size = 32000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + From c7dda8ca6dc39c844464b1a0e61e9be971bcaf09 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Wed, 27 Mar 2024 21:52:15 +0000 Subject: [PATCH 002/131] fix __init__.py --- src/transformers/models/dbrx/__init__.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 1ee030d8af83..6d84140b8efe 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -31,16 +31,10 @@ else: _import_structure["modeling_dbrx"] = [ "DBRX_PRETRAINED_MODEL_ARCHIVE_LIST", - "DbrxForMaskedLM", "DbrxForCausalLM", - "DbrxForMultipleChoice", - "DbrxForQuestionAnswering", - "DbrxForSequenceClassification", - "DbrxForTokenClassification", - "DbrxLayer", + "DbrxBlock", "DbrxModel", "DbrxPreTrainedModel", - "load_tf_weights_in_dbrx", ] @@ -57,16 +51,10 @@ else: from .modeling_dbrx import ( DBRX_PRETRAINED_MODEL_ARCHIVE_LIST, - DbrxForMaskedLM, DbrxForCausalLM, - DbrxForMultipleChoice, - DbrxForQuestionAnswering, - DbrxForSequenceClassification, - DbrxForTokenClassification, - DbrxLayer, + DbrxBlock, DbrxModel, DbrxPreTrainedModel, - load_tf_weights_in_dbrx, ) From 18495d034120e30b94f71416c56a8eb03edc87e1 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Wed, 27 Mar 2024 22:13:03 +0000 Subject: [PATCH 003/131] add docs --- docs/source/en/model_doc/dbrx.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 666a65bcf036..54417551a164 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -14,17 +14,25 @@ specific language governing permissions and limitations under the License. ## Overview -The DBRX model was proposed in []() by . +DBRX is a [transformer-based](https://www.isattentionallyouneed.com/) decoder-only large language model (LLM) that was trained using next-token prediction. +It uses a *fine-grained* mixture-of-experts (MoE) architecture with 132B total parameters of which 36B parameters are active on any input. +It was pre-trained on 12T tokens of text and code data. +Compared to other open MoE models like Mixtral-8x7B and Grok-1, DBRX is fine-grained, meaning it uses a larger number of smaller experts. DBRX has 16 experts and chooses 4, while Mixtral-8x7B and Grok-1 have 8 experts and choose 2. +This provides 65x more possible combinations of experts and we found that this improves model quality. +DBRX uses rotary position encodings (RoPE), gated linear units (GLU), and grouped query attention (GQA). +It uses the GPT-4 tokenizer as described in the [tiktoken](https://github.com/openai/tiktoken) repository. +We made these choices based on exhaustive evaluation and scaling experiments. -The abstract from the paper is the following: +DBRX was pretrained on 12T tokens of carefully curated data and a maximum context length of 32K tokens. 
+We estimate that this data is at least 2x better token-for-token than the data we used to pretrain the MPT family of models. +This new dataset was developed using the full suite of Databricks tools, including Apache Spark™ and Databricks notebooks for data processing, and Unity Catalog for data management and governance. +We used curriculum learning for pretraining, changing the data mix during training in ways we found to substantially improve model quality. -** -Tips: +More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). - -This model was contributed by [INSERT YOUR HF USERNAME HERE](). The original code can be found [here](). +This model was contributed by [abhi-db]( Date: Thu, 28 Mar 2024 11:17:18 -0700 Subject: [PATCH 004/131] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/dbrx/modeling_dbrx.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 9a0b62dbc355..ea8011e514ef 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -45,11 +45,8 @@ _CONFIG_FOR_DOC = 'DbrxConfig' -############################################################################# -# Copied from LLaMaRotaryEmbedding -############################################################################# - +# Copied from transformers.models.gemma.modeling_gemma. GemmaRotaryEmbedding with Gemma->Dbrx class DbrxRotaryEmbedding(nn.Module): def __init__(self, @@ -89,14 +86,14 @@ def forward( sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - +# Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x: torch.Tensor) -> torch.Tensor: """Rotates half the hidden dims of the input.""" x1 = x[..., :x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2:] return torch.cat((-x2, x1), dim=-1) - +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb( q: torch.Tensor, k: torch.Tensor, @@ -127,7 +124,7 @@ def apply_rotary_pos_emb( k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed - +# Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
@@ -144,11 +141,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: head_dim) -############################################################################# - -############################################################################# -# Modified from modeling_mixtral -############################################################################# def load_balancing_loss_func( @@ -258,7 +250,7 @@ def resolve_ffn_act_fn( # Copied from LLaMaAttention ############################################################################# - +# Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(attention_mask: torch.Tensor): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() @@ -522,7 +514,7 @@ def forward( attn_weights = None return attn_output, attn_weights, past_key_value # type: ignore - + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( self, query_states: torch.Tensor, @@ -588,7 +580,7 @@ def _flash_attention_forward( ) return attn_output - + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor, attention_mask: torch.Tensor, query_length: int): From a27c69acc56434254c8aa211777b11c1db515799 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Thu, 28 Mar 2024 21:31:08 +0000 Subject: [PATCH 005/131] address comments 1 --- src/transformers/models/dbrx/__init__.py | 6 +- .../models/dbrx/configuration_dbrx.py | 6 +- src/transformers/models/dbrx/modeling_dbrx.py | 702 +----------------- 3 files changed, 23 insertions(+), 691 deletions(-) diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 6d84140b8efe..311983ca3087 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -20,7 +20,7 @@ _import_structure = { - "configuration_dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"], + "configuration_dbrx": ["DbrxConfig"], } try: @@ -30,7 +30,6 @@ pass else: _import_structure["modeling_dbrx"] = [ - "DBRX_PRETRAINED_MODEL_ARCHIVE_LIST", "DbrxForCausalLM", "DbrxBlock", "DbrxModel", @@ -41,7 +40,7 @@ if TYPE_CHECKING: - from .configuration_dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig + from .configuration_dbrx import DbrxConfig try: if not is_torch_available(): @@ -50,7 +49,6 @@ pass else: from .modeling_dbrx import ( - DBRX_PRETRAINED_MODEL_ARCHIVE_LIST, DbrxForCausalLM, DbrxBlock, DbrxModel, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index be75cdceb04b..b743f69cc810 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -16,13 +16,11 @@ from typing import Any, Optional -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging +from ...configuration_utils import PretrainedConfig +from ...utils import logging logger = logging.get_logger(__name__) -DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - class DbrxAttentionConfig(PretrainedConfig): """Configuration class for Dbrx Attention. 
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index ea8011e514ef..33fa189c2346 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -24,12 +24,13 @@ import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import (MoeCausalLMOutputWithPast, +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import (MoeCausalLMOutputWithPast, MoeModelOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import is_flash_attn_2_available, logging +from ...modeling_utils import PreTrainedModel +from ...utils import is_flash_attn_2_available, logging +from ...activations import ACT2FN from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig @@ -223,32 +224,6 @@ def load_balancing_loss_func( return overall_loss * num_experts -############################################################################# - - -def resolve_ffn_act_fn( - ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]: - """Resolve the activation function for the feed-forward network. - - Args: - ffn_act_fn (dict): The configuration dictionary for the activation function. - The dict config must specify the 'name' of a torch.nn.functional activation - function. All of other key values pairs are bound to the function as a partial. - - Returns: - Callable[[torch.Tensor], torch.Tensor]: The activation function. - """ - config = deepcopy(ffn_act_fn) - name = config.pop('name') - if not hasattr(nn.functional, name): - raise ValueError(f'Unrecognised activation function name ({name}).') - act = getattr(nn.functional, name) - return partial(act, **config) - - -############################################################################# -# Copied from LLaMaAttention -############################################################################# # Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(attention_mask: torch.Tensor): @@ -711,19 +686,12 @@ def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, self.moe_num_experts, bias=False) - def jitter(self, x: torch.Tensor) -> torch.Tensor: - if self.moe_jitter_eps is None: - raise RuntimeError('The router does not have moe_jitter_eps set.') - low = 1.0 - self.moe_jitter_eps - high = 1.0 + self.moe_jitter_eps - noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) - return low + noise * (high - low) def forward( self, x: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: if self.training and self.moe_jitter_eps is not None: - x = x * self.jitter(x) + x *= torch.empty_like(x).uniform_(1.0 - self.moe_jitter_eps, 1.0 + self.moe_jitter_eps) weights = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1, @@ -767,7 +735,11 @@ def __init__(self, hidden_size: int, ffn_hidden_size: int, torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) self.w2 = nn.Parameter( torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.activation_fn = resolve_ffn_act_fn(ffn_act_fn) + + act_fn_name = ffn_act_fn.pop('name') + if len(ffn_act_fn) != 0: + raise ValueError(f'FFN activation function has unhandled kwargs {ffn_act_fn=}') + 
self.activation_fn = ACT2FN[act_fn_name] def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, @@ -777,12 +749,12 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] - x1 = x.matmul(expert_w1.t()) - x2 = x.matmul(expert_v1.t()) - x1 = self.activation_fn(x1) - x1 = x1 * x2 - x1 = x1.matmul(expert_w2) - return x1 + gate_proj = x.matmul(expert_w1.t()) + up_proj = x.matmul(expert_v1.t()) + gate_proj = self.activation_fn(gate_proj) + intermediate_states = gate_proj * up_proj + down_proj = intermediate_states.matmul(expert_w2) + return down_proj class DbrxExperts(nn.Module): @@ -852,7 +824,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: return out, weights -class DbrxBlock(nn.Module): +class DbrxDecoderLayer(nn.Module): def __init__(self, config: DbrxConfig, block_idx: int): super().__init__() @@ -1486,639 +1458,3 @@ def _reorder_cache(past_key_values: Cache, beam_idx: torch.LongTensor): past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),) return reordered_past - - - -# @add_start_docstrings("""DBRX Model with a `language modeling` head on top. """, DBRX_START_DOCSTRING) -# class DbrxForMaskedLM(DbrxPreTrainedModel): -# def __init__(self, config): -# super().__init__(config) - -# if config.is_decoder: -# logger.warning( -# "If you want to use `DbrxForMaskedLM` make sure `config.is_decoder=False` for " -# "bi-directional self-attention." -# ) - -# self.dbrx = DbrxModel(config) -# self.cls = DbrxOnlyMLMHead(config) - -# # Initialize weights and apply final processing -# self.post_init() - -# def get_output_embeddings(self): -# return self.cls.predictions.decoder - -# def set_output_embeddings(self, new_embeddings): -# self.cls.predictions.decoder = new_embeddings - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) -# @add_code_sample_docstrings( -# checkpoint=_CHECKPOINT_FOR_DOC, -# output_type=MaskedLMOutput, -# config_class=_CONFIG_FOR_DOC, -# ) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# head_mask=None, -# inputs_embeds=None, -# encoder_hidden_states=None, -# encoder_attention_mask=None, -# labels=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): -# Labels for computing the masked language modeling loss. -# Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) -# Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels -# in `[0, ..., config.vocab_size]`. 
-# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# encoder_hidden_states=encoder_hidden_states, -# encoder_attention_mask=encoder_attention_mask, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] -# prediction_scores = self.cls(sequence_output) - -# masked_lm_loss = None -# if labels is not None: -# loss_fct = CrossEntropyLoss() # -100 index = padding token -# masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - -# if not return_dict: -# output = (prediction_scores,) + outputs[1:] -# return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - -# return MaskedLMOutput( -# loss=masked_lm_loss, -# logits=prediction_scores, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# ) - -# def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): -# input_shape = input_ids.shape -# effective_batch_size = input_shape[0] - -# # add a dummy token -# assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" -# attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) -# dummy_token = torch.full( -# (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device -# ) -# input_ids = torch.cat([input_ids, dummy_token], dim=1) - -# return {"input_ids": input_ids, "attention_mask": attention_mask} - - -# @add_start_docstrings( -# """DBRX Model with a `language modeling` head on top for CLM fine-tuning. """, DBRX_START_DOCSTRING -# ) -# class DbrxForCausalLM(DbrxPreTrainedModel): - -# _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - -# def __init__(self, config): -# super().__init__(config) - -# if not config.is_decoder: -# logger.warning("If you want to use `DbrxForCausalLM` as a standalone, add `is_decoder=True.`") - -# self.dbrx = DbrxModel(config) -# self.cls = DbrxOnlyMLMHead(config) - -# # Initialize weights and apply final processing -# self.post_init() - -# def get_output_embeddings(self): -# return self.cls.predictions.decoder - -# def set_output_embeddings(self, new_embeddings): -# self.cls.predictions.decoder = new_embeddings - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) -# @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# inputs_embeds=None, -# encoder_hidden_states=None, -# encoder_attention_mask=None, -# head_mask=None, -# cross_attn_head_mask=None, -# past_key_values=None, -# labels=None, -# use_cache=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): -# Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if -# the model is configured as a decoder. 
-# encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): -# Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in -# the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: - -# - 1 for tokens that are **not masked**, -# - 0 for tokens that are **masked**. -# past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): -# Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 -# tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional -# tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two -# additional tensors are only required when the model is used as a decoder in a Sequence to Sequence -# model. - -# Contains pre-computed hidden-states (key and values in the self-attention blocks and in the -# cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential -# decoding. - -# If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` -# (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` -# instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. -# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): -# Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in -# `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are -# ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. -# use_cache (`bool`, *optional*): -# If set to `True`, `past_key_values` key value states are returned and can be used to speed up -# decoding (see `past_key_values`). 
- -# Returns: - -# Example: - -# ```python -# >>> from transformers import DbrxTokenizer, DbrxForCausalLM, DbrxConfig -# >>> import torch - -# >>> tokenizer = DbrxTokenizer.from_pretrained('databricks/dbrx-instruct') -# >>> config = DbrxConfig.from_pretrained("databricks/dbrx-instruct") -# >>> config.is_decoder = True -# >>> model = DbrxForCausalLM.from_pretrained('databricks/dbrx-instruct', config=config) - -# >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") -# >>> outputs = model(**inputs) - -# >>> prediction_logits = outputs.logits -# ``` -# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# encoder_hidden_states=encoder_hidden_states, -# encoder_attention_mask=encoder_attention_mask, -# past_key_values=past_key_values, -# use_cache=use_cache, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] -# prediction_scores = self.cls(sequence_output) - -# lm_loss = None -# if labels is not None: -# # we are doing next-token prediction; shift prediction scores and input ids by one -# shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() -# labels = labels[:, 1:].contiguous() -# loss_fct = CrossEntropyLoss() -# lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - -# if not return_dict: -# output = (prediction_scores,) + outputs[1:] -# return ((lm_loss,) + output) if lm_loss is not None else output - -# return CausalLMOutputWithCrossAttentions( -# loss=lm_loss, -# logits=prediction_scores, -# past_key_values=outputs.past_key_values, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# cross_attentions=outputs.cross_attentions, -# ) - -# def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): -# input_shape = input_ids.shape - -# # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly -# if attention_mask is None: -# attention_mask = input_ids.new_ones(input_shape) - -# # cut decoder_input_ids if past is used -# if past_key_values is not None: -# input_ids = input_ids[:, -1:] - -# return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} - -# def _reorder_cache(self, past_key_values, beam_idx): -# reordered_past = () -# for layer_past in past_key_values: -# reordered_past += (tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + layer_past[2:],) -# return reordered_past - -# class DbrxClassificationHead(nn.Module): -# """Head for sentence-level classification tasks.""" - -# def __init__(self, config): -# super().__init__() -# self.dense = nn.Linear(config.hidden_size, config.hidden_size) -# self.dropout = nn.Dropout(config.hidden_dropout_prob) -# self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - -# self.config = config - -# def forward(self, features, **kwargs): -# x = features[:, 0, :] # take token (equiv. 
to [CLS]) -# x = self.dropout(x) -# x = self.dense(x) -# x = ACT2FN[self.config.hidden_act](x) -# x = self.dropout(x) -# x = self.out_proj(x) -# return x - - -# @add_start_docstrings( -# """DBRX Model transformer with a sequence classification/regression head on top (a linear layer on top of -# the pooled output) e.g. for GLUE tasks. """, -# DBRX_START_DOCSTRING, -# ) -# class DbrxForSequenceClassification(DbrxPreTrainedModel): -# def __init__(self, config): -# super().__init__(config) -# self.num_labels = config.num_labels -# self.dbrx = DbrxModel(config) -# self.classifier = DbrxClassificationHead(config) - -# # Initialize weights and apply final processing -# self.post_init() - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) -# @add_code_sample_docstrings( -# checkpoint=_CHECKPOINT_FOR_DOC, -# output_type=SequenceClassifierOutput, -# config_class=_CONFIG_FOR_DOC, -# ) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# head_mask=None, -# inputs_embeds=None, -# labels=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): -# Labels for computing the sequence classification/regression loss. -# Indices should be in `[0, ..., config.num_labels - 1]`. -# If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), -# If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). -# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] -# logits = self.classifier(sequence_output) - -# loss = None -# if labels is not None: -# if self.config.problem_type is None: -# if self.num_labels == 1: -# self.config.problem_type = "regression" -# elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): -# self.config.problem_type = "single_label_classification" -# else: -# self.config.problem_type = "multi_label_classification" - -# if self.config.problem_type == "regression": -# loss_fct = MSELoss() -# if self.num_labels == 1: -# loss = loss_fct(logits.squeeze(), labels.squeeze()) -# else: -# loss = loss_fct(logits, labels) -# elif self.config.problem_type == "single_label_classification": -# loss_fct = CrossEntropyLoss() -# loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) -# elif self.config.problem_type == "multi_label_classification": -# loss_fct = BCEWithLogitsLoss() -# loss = loss_fct(logits, labels) -# if not return_dict: -# output = (logits,) + outputs[1:] -# return ((loss,) + output) if loss is not None else output - -# return SequenceClassifierOutput( -# loss=loss, -# logits=logits, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# ) - -# @add_start_docstrings( -# """DBRX Model with a multiple choice classification head on top (a linear layer on top of -# the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", -# DBRX_START_DOCSTRING, -# ) -# class DbrxForMultipleChoice(DbrxPreTrainedModel): -# def __init__(self, config): -# super().__init__(config) - -# self.dbrx = DbrxModel(config) -# self.sequence_summary = SequenceSummary(config) -# self.classifier = nn.Linear(config.hidden_size, 1) - -# # Initialize weights and apply final processing -# self.post_init() - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) -# @add_code_sample_docstrings( -# checkpoint=_CHECKPOINT_FOR_DOC, -# output_type=MultipleChoiceModelOutput, -# config_class=_CONFIG_FOR_DOC, -# ) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# head_mask=None, -# inputs_embeds=None, -# labels=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): -# Labels for computing the multiple choice classification loss. -# Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension -# of the input tensors. (See `input_ids` above) -# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict -# num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - -# input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None -# attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None -# token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None -# position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None -# inputs_embeds = ( -# inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) -# if inputs_embeds is not None -# else None -# ) - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] - -# pooled_output = self.sequence_summary(sequence_output) -# logits = self.classifier(pooled_output) -# reshaped_logits = logits.view(-1, num_choices) - -# loss = None -# if labels is not None: -# loss_fct = CrossEntropyLoss() -# loss = loss_fct(reshaped_logits, labels) - -# if not return_dict: -# output = (reshaped_logits,) + outputs[1:] -# return ((loss,) + output) if loss is not None else output - -# return MultipleChoiceModelOutput( -# loss=loss, -# logits=reshaped_logits, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# ) - - -# @add_start_docstrings( -# """DBRX Model with a token classification head on top (a linear layer on top of -# the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", -# DBRX_START_DOCSTRING, -# ) -# class DbrxForTokenClassification(DbrxPreTrainedModel): -# def __init__(self, config): -# super().__init__(config) -# self.num_labels = config.num_labels - -# self.dbrx = DbrxModel(config) -# self.dropout = nn.Dropout(config.hidden_dropout_prob) -# self.classifier = nn.Linear(config.hidden_size, config.num_labels) - -# # Initialize weights and apply final processing -# self.post_init() - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) -# @add_code_sample_docstrings( -# checkpoint=_CHECKPOINT_FOR_DOC, -# output_type=TokenClassifierOutput, -# config_class=_CONFIG_FOR_DOC, -# ) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# head_mask=None, -# inputs_embeds=None, -# labels=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): -# Labels for computing the token classification loss. -# Indices should be in `[0, ..., config.num_labels - 1]`. -# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] - -# sequence_output = self.dropout(sequence_output) -# logits = self.classifier(sequence_output) - -# loss = None -# if labels is not None: -# loss_fct = CrossEntropyLoss() -# loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - -# if not return_dict: -# output = (logits,) + outputs[1:] -# return ((loss,) + output) if loss is not None else output - -# return TokenClassifierOutput( -# loss=loss, -# logits=logits, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# ) - - -# @add_start_docstrings( -# """DBRX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear -# layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, -# DBRX_START_DOCSTRING, -# ) -# class DbrxForQuestionAnswering(DbrxPreTrainedModel): -# def __init__(self, config): -# super().__init__(config) - -# config.num_labels = 2 -# self.num_labels = config.num_labels - -# self.dbrx = DbrxModel(config) -# self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - -# # Initialize weights and apply final processing -# self.post_init() - -# @add_start_docstrings_to_model_forward(DBRX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) -# @add_code_sample_docstrings( -# checkpoint=_CHECKPOINT_FOR_DOC, -# output_type=QuestionAnsweringModelOutput, -# config_class=_CONFIG_FOR_DOC, -# ) -# def forward( -# self, -# input_ids=None, -# attention_mask=None, -# token_type_ids=None, -# position_ids=None, -# head_mask=None, -# inputs_embeds=None, -# start_positions=None, -# end_positions=None, -# output_attentions=None, -# output_hidden_states=None, -# return_dict=None, -# ): -# r""" -# start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): -# Labels for position (index) of the start of the labelled span for computing the token classification loss. -# Positions are clamped to the length of the sequence (`sequence_length`). 
-# Position outside of the sequence are not taken into account for computing the loss. -# end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): -# Labels for position (index) of the end of the labelled span for computing the token classification loss. -# Positions are clamped to the length of the sequence (`sequence_length`). -# Position outside of the sequence are not taken into account for computing the loss. -# """ -# return_dict = return_dict if return_dict is not None else self.config.use_return_dict - -# outputs = self.dbrx( -# input_ids, -# attention_mask=attention_mask, -# token_type_ids=token_type_ids, -# position_ids=position_ids, -# head_mask=head_mask, -# inputs_embeds=inputs_embeds, -# output_attentions=output_attentions, -# output_hidden_states=output_hidden_states, -# return_dict=return_dict, -# ) - -# sequence_output = outputs[0] - -# logits = self.qa_outputs(sequence_output) -# start_logits, end_logits = logits.split(1, dim=-1) -# start_logits = start_logits.squeeze(-1) -# end_logits = end_logits.squeeze(-1) - -# total_loss = None -# if start_positions is not None and end_positions is not None: -# # If we are on multi-GPU, split add a dimension -# if len(start_positions.size()) > 1: -# start_positions = start_positions.squeeze(-1) -# if len(end_positions.size()) > 1: -# end_positions = end_positions.squeeze(-1) -# # sometimes the start/end positions are outside our model inputs, we ignore these terms -# ignored_index = start_logits.size(1) -# start_positions = start_positions.clamp(0, ignored_index) -# end_positions = end_positions.clamp(0, ignored_index) - -# loss_fct = CrossEntropyLoss(ignore_index=ignored_index) -# start_loss = loss_fct(start_logits, start_positions) -# end_loss = loss_fct(end_logits, end_positions) -# total_loss = (start_loss + end_loss) / 2 - -# if not return_dict: -# output = (start_logits, end_logits) + outputs[1:] -# return ((total_loss,) + output) if total_loss is not None else output - -# return QuestionAnsweringModelOutput( -# loss=total_loss, -# start_logits=start_logits, -# end_logits=end_logits, -# hidden_states=outputs.hidden_states, -# attentions=outputs.attentions, -# ) From 5417623c302baaa200f6c8db6be0cd53635b7686 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Thu, 28 Mar 2024 21:51:09 +0000 Subject: [PATCH 006/131] work on make fixup --- src/transformers/__init__.py | 37 +- src/transformers/models/__init__.py | 1 + src/transformers/models/dbrx/__init__.py | 10 +- .../models/dbrx/configuration_dbrx.py | 76 +- src/transformers/models/dbrx/modeling_dbrx.py | 759 ++++++++---------- src/transformers/utils/dummy_pt_objects.py | 28 + tests/models/dbrx/test_modeling_dbrx.py | 162 ++-- 7 files changed, 450 insertions(+), 623 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 4af0be04cb61..3494821816c2 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -131,7 +131,6 @@ ], "models": [], # Models - "models.dbrx": ["DbrxConfig"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.align": [ "ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP", @@ -328,6 +327,7 @@ "Data2VecTextConfig", "Data2VecVisionConfig", ], + "models.dbrx": ["DbrxConfig"], "models.deberta": [ "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", @@ -1443,15 +1443,6 @@ # PyTorch models structure - _import_structure["models.dbrx"].extend( - [ - "DbrxForCausalLM", - "DbrxBlock", - "DbrxModel", - "DbrxPreTrainedModel", - ] - ) - _import_structure["models.albert"].extend( 
[ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1476,6 +1467,7 @@ "AlignVisionModel", ] ) + _import_structure["models.altclip"].extend( [ "ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1943,6 +1935,14 @@ "Data2VecVisionPreTrainedModel", ] ) + _import_structure["models.dbrx"].extend( + [ + "DbrxBlock", + "DbrxForCausalLM", + "DbrxModel", + "DbrxPreTrainedModel", + ] + ) _import_structure["models.deberta"].extend( [ "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -6735,6 +6735,14 @@ Data2VecVisionModel, Data2VecVisionPreTrainedModel, ) + + # PyTorch model imports + from .models.dbrx import ( + DbrxBlock, + DbrxForCausalLM, + DbrxModel, + DbrxPreTrainedModel, + ) from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaForMaskedLM, @@ -7794,15 +7802,6 @@ SamModel, SamPreTrainedModel, ) - - # PyTorch model imports - - from .models.dbrx import ( - DbrxForCausalLM, - DbrxBlock, - DbrxModel, - DbrxPreTrainedModel, - ) from .models.seamless_m4t import ( SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST, SeamlessM4TCodeHifiGan, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0599d3b876e6..7ac3017b49af 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -59,6 +59,7 @@ ctrl, cvt, data2vec, + dbrx, deberta, deberta_v2, decision_transformer, diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 311983ca3087..7660c376198e 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -13,10 +13,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import _LazyModule, OptionalDependencyNotAvailable -from ...utils import is_torch_available - - +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -37,8 +34,6 @@ ] - - if TYPE_CHECKING: from .configuration_dbrx import DbrxConfig @@ -49,14 +44,13 @@ pass else: from .modeling_dbrx import ( - DbrxForCausalLM, DbrxBlock, + DbrxForCausalLM, DbrxModel, DbrxPreTrainedModel, ) - else: import sys diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index b743f69cc810..048526d4ac93 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -19,6 +19,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging + logger = logging.get_logger(__name__) @@ -54,30 +55,25 @@ def __init__( self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ['model_type']: + for k in ["model_type"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: - raise ValueError(f'Found unknown {kwargs=}') + raise ValueError(f"Found unknown {kwargs=}") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, - **kwargs: Any) -> 'PretrainedConfig': + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": cls._set_token_in_kwargs(kwargs) - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if config_dict.get('model_type') == 'dbrx': - config_dict = config_dict['attn_config'] + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["attn_config"] - if 'model_type' in config_dict and hasattr( - cls, - 'model_type') and config_dict['model_type'] != cls.model_type: + if 
"model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + - f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." ) return cls.from_dict(config_dict, **kwargs) @@ -120,7 +116,7 @@ def __init__( ): super().__init__() if ffn_act_fn is None: - ffn_act_fn = {'name': 'silu'} + ffn_act_fn = {"name": "silu"} self.ffn_act_fn = ffn_act_fn self.ffn_hidden_size = ffn_hidden_size self.moe_num_experts = moe_num_experts @@ -130,30 +126,25 @@ def __init__( self.moe_normalize_expert_weights = moe_normalize_expert_weights self.uniform_expert_assignment = uniform_expert_assignment - for k in ['model_type']: + for k in ["model_type"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: - raise ValueError(f'Found unknown {kwargs=}') + raise ValueError(f"Found unknown {kwargs=}") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, - **kwargs: Any) -> 'PretrainedConfig': + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": cls._set_token_in_kwargs(kwargs) - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if config_dict.get('model_type') == 'dbrx': - config_dict = config_dict['ffn_config'] + if config_dict.get("model_type") == "dbrx": + config_dict = config_dict["ffn_config"] - if 'model_type' in config_dict and hasattr( - cls, - 'model_type') and config_dict['model_type'] != cls.model_type: + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + - f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." ) return cls.from_dict(config_dict, **kwargs) @@ -170,15 +161,15 @@ class DbrxConfig(PretrainedConfig): Args: - d_model (`int`, *optional*, defaults to 6144): + d_model (`int`, *optional*, defaults to 2048): Dimensionality of the embeddings and hidden states. - n_heads (`int`, *optional*, defaults to 48): + n_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - n_layers (`int`, *optional*, defaults to 40): + n_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - max_seq_len (`int`, *optional*, defaults to 32768): + max_seq_len (`int`, *optional*, defaults to 2048): The maximum sequence length of the model. - vocab_size (`int`, *optional*, defaults to 100352): + vocab_size (`int`, *optional*, defaults to 32000): Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by the `inputs_ids` passed when calling [`DbrxModel`]. resid_pdrop (`float`, *optional*, defaults to 0.0): @@ -189,14 +180,14 @@ class DbrxConfig(PretrainedConfig): A dictionary used to configure the model's attention module. ffn_config (`dict`, *optional*): A dictionary used to configure the model's FFN module. 
- use_cache (`bool`, *optional*, defaults to `False`): + use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. output_router_logits (`bool`, *optional*, defaults to `False`): Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. See [here]() for more details - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.05): The aux loss factor for the total loss. @@ -215,12 +206,12 @@ class DbrxConfig(PretrainedConfig): ``` """ - model_type = 'dbrx' + model_type = "dbrx" attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - 'max_position_embeddings': 'max_seq_len' + "num_attention_heads": "n_heads", + "hidden_size": "d_model", + "num_hidden_layers": "n_layers", + "max_position_embeddings": "max_seq_len", } def __init__( @@ -266,12 +257,11 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - tie_word_embeddings = kwargs.pop('tie_word_embeddings', False) + tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: - raise ValueError( - 'tie_word_embeddings is not supported for Dbrx models.') + raise ValueError("tie_word_embeddings is not supported for Dbrx models.") super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, - ) \ No newline at end of file + ) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 33fa189c2346..ed1502f1e671 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -16,91 +16,76 @@ import math import warnings -from copy import deepcopy -from functools import partial -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import torch import torch.nn.functional as F import torch.utils.checkpoint from torch import nn + +from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_outputs import (MoeCausalLMOutputWithPast, - MoeModelOutputWithPast) +from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel from ...utils import is_flash_attn_2_available, logging -from ...activations import ACT2FN - from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig + if is_flash_attn_2_available(): - try: - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input - except: - pass + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = 'DbrxConfig' +_CONFIG_FOR_DOC = "DbrxConfig" -# Copied from transformers.models.gemma.modeling_gemma. 
GemmaRotaryEmbedding with Gemma->Dbrx +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->Dbrx class DbrxRotaryEmbedding(nn.Module): - - def __init__(self, - dim: int, - max_position_embeddings: int = 2048, - base: float = 10000.0, - scaling_factor: float = 1.0): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() - self.scaling_factor = scaling_factor + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.register_buffer("inv_freq", None, persistent=False) @torch.no_grad() - def forward( - self, x: torch.Tensor, position_ids: torch.LongTensor - ) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand( - position_ids.shape[0], -1, 1) + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) + ) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance( - device_type, str) and device_type != 'mps' else 'cpu' + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() - @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + # Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x: torch.Tensor) -> torch.Tensor: +def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) + # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb( - q: torch.Tensor, - k: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - unsqueeze_dim: int = 1) -> Tuple[torch.Tensor, torch.Tensor]: +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -108,14 +93,15 @@ def apply_rotary_pos_emb( k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos and - sin so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos and sin have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ @@ -125,23 +111,18 @@ def apply_rotary_pos_emb( k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). - - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states - hidden_states = hidden_states[:, :, - None, :, :].expand(batch, num_key_value_heads, - n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, - head_dim) - - + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) def load_balancing_loss_func( @@ -176,12 +157,9 @@ def load_balancing_loss_func( if isinstance(gate_logits, tuple): compute_device = gate_logits[0].device - concatenated_gate_logits = torch.cat( - [layer_gate.to(compute_device) for layer_gate in gate_logits], - dim=0) + concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, - dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -195,43 +173,44 @@ def load_balancing_loss_func( router_prob_per_expert = torch.mean(routing_weights, dim=0) else: batch_size, sequence_length = attention_mask.shape - num_hidden_layers = concatenated_gate_logits.shape[0] // ( - batch_size * sequence_length) + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask - expert_attention_mask = (attention_mask[None, :, :, None, None].expand( - (num_hidden_layers, batch_size, sequence_length, top_k, - num_experts)).reshape(-1, top_k, num_experts).to(compute_device)) + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape(-1, top_k, num_experts) + .to(compute_device) + ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum( - expert_mask.float() * 
expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0) + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( + expert_attention_mask, dim=0 + ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert router_per_expert_attention_mask = ( - attention_mask[None, :, :, None].expand( - (num_hidden_layers, batch_size, sequence_length, - num_experts)).reshape(-1, num_experts).to(compute_device)) + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape(-1, num_experts) + .to(compute_device) + ) # Compute the average probability of routing to these experts - router_prob_per_expert = torch.sum( - routing_weights * router_per_expert_attention_mask, - dim=0) / torch.sum(router_per_expert_attention_mask, dim=0) + router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum( + router_per_expert_attention_mask, dim=0 + ) - overall_loss = torch.sum(tokens_per_expert * - router_prob_per_expert.unsqueeze(0)) + overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) return overall_loss * num_experts - # Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask: torch.Tensor): +def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), - (1, 0)) + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) return ( indices, cu_seqlens, @@ -242,12 +221,14 @@ def _get_unpad_data(attention_mask: torch.Tensor): class DbrxAttention(nn.Module): """Multi-head self attention.""" - def __init__(self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - attn_config: DbrxAttentionConfig, - block_idx: Optional[int] = None): + def __init__( + self, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + attn_config: DbrxAttentionConfig, + block_idx: Optional[int] = None, + ): super().__init__() self.hidden_size = hidden_size self.num_heads = num_heads @@ -257,10 +238,10 @@ def __init__(self, self.config = attn_config if block_idx is None: logger.warning_once( - f'Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will ' - + - 'lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` ' - + 'when creating this class.') + f"Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will " + + "lead to errors during the forward call if caching is used. Please make sure to provide a `block_idx` " + + "when creating this class." 
+ ) self.attn_pdrop = attn_config.attn_pdrop self.clip_qkv = attn_config.clip_qkv @@ -268,13 +249,10 @@ def __init__(self, self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.rope_theta = attn_config.rope_theta - self.Wqkv = nn.Linear(self.hidden_size, - self.hidden_size + - 2 * self.num_key_value_heads * self.head_dim, - bias=False) - self.out_proj = nn.Linear(self.hidden_size, - self.hidden_size, - bias=False) + self.Wqkv = nn.Linear( + self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=False + ) + self.out_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) self.rotary_emb = DbrxRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, @@ -307,52 +285,38 @@ def forward( dim=2, ) - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - past_key_value = getattr(self, 'past_key_value', past_key_value) + past_key_value = getattr(self, "past_key_value", past_key_value) cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, - key_states, cos, sin) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: # sin and cos are specific to RoPE models; position_ids needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.block_idx, cache_kwargs) + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) - attn_weights = torch.matmul(query_states, key_states.transpose( - 2, 3)) / math.sqrt(self.head_dim) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, :key_states.shape[-2]] + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to( - query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.attn_pdrop, - training=self.training) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attn_pdrop, training=self.training) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' - + f' {attn_output.size()}') + f"`attn_output` should be 
of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + + f" {attn_output.size()}" + ) attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) @@ -374,9 +338,7 @@ class DbrxFlashAttention2(DbrxAttention): def __init__(self, *args: Any, **kwargs: Any): if not is_flash_attn_2_available(): - raise ImportError( - 'Flash Attention 2 is not available. Please install it with `pip install flash-attn`.' - ) + raise ImportError("Flash Attention 2 is not available. Please install it with `pip install flash-attn`.") super().__init__(*args, **kwargs) @@ -390,11 +352,8 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, **kwargs: Any, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: - logger.info( - 'Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.' - ) + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.info("Implicitly setting `output_attentions` to False as it is not supported in Flash Attention.") output_attentions = False bsz, q_len, _ = hidden_states.size() @@ -415,28 +374,19 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, - self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, - key_states, cos, sin) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - past_key_value = getattr(self, 'past_key_value', past_key_value) + past_key_value = getattr(self, "past_key_value", past_key_value) if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - 'sin': sin, - 'cos': cos, - 'cache_position': cache_position - } - key_states, value_states = past_key_value.update( - key_states, value_states, self.block_idx, cache_kwargs) + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs) # TODO: These transpose are quite inefficient but Flash Attention requires the layout # [batch_size, sequence_length, num_heads, head_dim]. 
We would need to refactor the KV cache @@ -457,16 +407,16 @@ def forward( if torch.is_autocast_enabled(): target_dtype = torch.get_autocast_gpu_dtype() # Handle the case where the model is quantized - elif hasattr(self.config, '_pre_quantization_dtype'): + elif hasattr(self.config, "_pre_quantization_dtype"): target_dtype = self.config._pre_quantization_dtype else: target_dtype = query_states.dtype logger.warning_once( - f'The input hidden states seems to be silently casted in float32, this might be ' - + - f'related to the fact you have upcasted embedding or layer norm layers in ' - + f'float32. We will cast back the input in {target_dtype}.') + "The input hidden states seems to be silently casted in float32, this might be " + + "related to the fact you have upcasted embedding or layer norm layers in " + + f"float32. We will cast back the input in {target_dtype}." + ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) @@ -481,46 +431,49 @@ def forward( dropout=dropout_rate, ) - attn_output = attn_output.reshape(bsz, q_len, - self.hidden_size).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.out_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value # type: ignore + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: Union[torch.LongTensor, None], - query_length: int, - dropout: float = 0.0, - softmax_scale: Optional[float] = None, + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None ): - """Use FlashAttention, stripping padding tokens if necessary. + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. Args: - query_states (torch.Tensor): Input query states to be passed to Flash Attention API - key_states (torch.Tensor): Input key states to be passed to Flash Attention API - value_states (torch.Tensor): Input value states to be passed to Flash Attention API - attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size - (batch_size, seq_len) where 0 stands for the position of padding tokens and 1 - for the position of non-padding tokens. - query_length (int): The length of the query sequence - dropout (float): Attention dropout - softmax_scale (float, optional): The scaling of QK^T before applying softmax. - Defaults to 1 / sqrt(head_dim) + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) """ - causal = True + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + # Contains at least one padding token in the sequence if attention_mask is not None: batch_size = query_states.shape[0] query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, - query_length) + query_states, key_states, value_states, attention_mask, query_length + ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens @@ -538,41 +491,29 @@ def _flash_attention_forward( causal=causal, ) - attn_output = pad_input( - attn_output_unpad, - indices_q, - batch_size, - query_length, - ) + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal ) return attn_output + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, - value_layer: torch.Tensor, attention_mask: torch.Tensor, - query_length: int): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data( - attention_mask) + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, - head_dim), indices_k) + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) if query_length == kv_seq_len: query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, - head_dim), indices_k) + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k @@ -586,8 +527,7 @@ def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, else: # The -q_len: slice assumes left padding. 
attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( - query_layer, attention_mask) + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) return ( query_layer, @@ -600,13 +540,12 @@ def _upad_input(self, query_layer: torch.Tensor, key_layer: torch.Tensor, DBRX_ATTENTION_CLASSES = { - 'eager': DbrxAttention, - 'flash_attention_2': DbrxFlashAttention2, + "eager": DbrxAttention, + "flash_attention_2": DbrxFlashAttention2, } class DbrxNormAttentionNorm(nn.Module): - def __init__( self, hidden_size: int, @@ -640,9 +579,7 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, **kwargs: Any, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], - Optional[Cache]]: - + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: residual_states = hidden_states hidden_states = self.norm_1(hidden_states).to(hidden_states.dtype) @@ -657,9 +594,7 @@ def forward( **kwargs, ) - hidden_states = nn.functional.dropout(hidden_states, - p=self.resid_pdrop, - training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.resid_pdrop, training=self.training) hidden_states = hidden_states + residual_states residual_states = hidden_states @@ -669,11 +604,15 @@ def forward( class DbrxRouter(nn.Module): - - def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, - moe_jitter_eps: Optional[float], - moe_normalize_expert_weights: Optional[float], - uniform_expert_assignment: bool): + def __init__( + self, + hidden_size: int, + moe_num_experts: int, + moe_top_k: int, + moe_jitter_eps: Optional[float], + moe_normalize_expert_weights: Optional[float], + uniform_expert_assignment: bool, + ): super().__init__() self.hidden_size = hidden_size self.moe_num_experts = moe_num_experts @@ -682,36 +621,26 @@ def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int, self.moe_normalize_expert_weights = moe_normalize_expert_weights self.uniform_expert_assignment = uniform_expert_assignment - self.layer = nn.Linear(self.hidden_size, - self.moe_num_experts, - bias=False) + self.layer = nn.Linear(self.hidden_size, self.moe_num_experts, bias=False) - - def forward( - self, x: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: if self.training and self.moe_jitter_eps is not None: x *= torch.empty_like(x).uniform_(1.0 - self.moe_jitter_eps, 1.0 + self.moe_jitter_eps) - weights = self.layer(x.view(-1, - x.shape[-1])).softmax(dim=-1, - dtype=torch.float32) + weights = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) if self.moe_normalize_expert_weights: top_weights = top_weights / torch.norm( - top_weights, - p=self.moe_normalize_expert_weights, - dim=-1, - keepdim=True) + top_weights, p=self.moe_normalize_expert_weights, dim=-1, keepdim=True + ) if self.uniform_expert_assignment: with torch.no_grad(): - uniform_tensor = torch.arange( - 0, - top_experts.numel(), - device=top_experts.device, - dtype=top_experts.dtype) % self.moe_num_experts + uniform_tensor = ( + torch.arange(0, top_experts.numel(), device=top_experts.device, dtype=top_experts.dtype) + % self.moe_num_experts + ) top_experts = uniform_tensor.reshape(top_experts.shape) # Note, weights and top_weights are not changed @@ 
-721,33 +650,25 @@ def forward( class DbrxExpertGLU(nn.Module): - - def __init__(self, hidden_size: int, ffn_hidden_size: int, - moe_num_experts: int, ffn_act_fn: dict): + def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): super().__init__() self.hidden_size = hidden_size self.ffn_hidden_size = ffn_hidden_size self.moe_num_experts = moe_num_experts - self.w1 = nn.Parameter( - torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.v1 = nn.Parameter( - torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.w2 = nn.Parameter( - torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - act_fn_name = ffn_act_fn.pop('name') + act_fn_name = ffn_act_fn.pop("name") if len(ffn_act_fn) != 0: - raise ValueError(f'FFN activation function has unhandled kwargs {ffn_act_fn=}') + raise ValueError(f"FFN activation function has unhandled kwargs {ffn_act_fn=}") self.activation_fn = ACT2FN[act_fn_name] def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: - expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, - self.hidden_size)[expert_idx] - expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, - self.hidden_size)[expert_idx] - expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, - self.hidden_size)[expert_idx] + expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] gate_proj = x.matmul(expert_w1.t()) up_proj = x.matmul(expert_v1.t()) @@ -758,25 +679,24 @@ def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: class DbrxExperts(nn.Module): - - def __init__(self, hidden_size: int, ffn_hidden_size: int, - moe_num_experts: int, ffn_act_fn: dict): + def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): super().__init__() self.moe_num_experts = moe_num_experts - self.mlp = DbrxExpertGLU(hidden_size=hidden_size, - ffn_hidden_size=ffn_hidden_size, - moe_num_experts=moe_num_experts, - ffn_act_fn=ffn_act_fn) - - def forward(self, x: torch.Tensor, weights: torch.Tensor, - top_weights: torch.Tensor, - top_experts: torch.LongTensor) -> torch.Tensor: + self.mlp = DbrxExpertGLU( + hidden_size=hidden_size, + ffn_hidden_size=ffn_hidden_size, + moe_num_experts=moe_num_experts, + ffn_act_fn=ffn_act_fn, + ) + + def forward( + self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor + ) -> torch.Tensor: bsz, q_len, hidden_size = x.shape x = x.view(-1, hidden_size) out = torch.zeros_like(x) - expert_mask = nn.functional.one_hot( - top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + expert_mask = nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) for expert_idx in range(0, self.moe_num_experts): topk_idx, token_idx = torch.where(expert_mask[expert_idx]) if token_idx.shape[0] == 0: @@ -786,9 +706,7 @@ def forward(self, x: torch.Tensor, weights: torch.Tensor, topk_list = topk_idx.tolist() expert_tokens = x[None, token_list].reshape(-1, 
hidden_size) - expert_out = self.mlp( - expert_tokens, expert_idx) * top_weights[token_list, topk_list, - None] + expert_out = self.mlp(expert_tokens, expert_idx) * top_weights[token_list, topk_list, None] out.index_add_(0, token_idx, expert_out) @@ -797,7 +715,6 @@ def forward(self, x: torch.Tensor, weights: torch.Tensor, class DbrxFFN(nn.Module): - def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): super().__init__() @@ -806,8 +723,7 @@ def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): moe_num_experts=ffn_config.moe_num_experts, moe_top_k=ffn_config.moe_top_k, moe_jitter_eps=ffn_config.moe_jitter_eps, - moe_normalize_expert_weights=ffn_config. - moe_normalize_expert_weights, + moe_normalize_expert_weights=ffn_config.moe_normalize_expert_weights, uniform_expert_assignment=ffn_config.uniform_expert_assignment, ) @@ -824,8 +740,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: return out, weights -class DbrxDecoderLayer(nn.Module): - +class DbrxBlock(nn.Module): def __init__(self, config: DbrxConfig, block_idx: int): super().__init__() self.hidden_size = config.d_model @@ -840,8 +755,7 @@ def __init__(self, config: DbrxConfig, block_idx: int): attn_config=config.attn_config, block_idx=block_idx, ) - self.ffn = DbrxFFN(hidden_size=config.d_model, - ffn_config=config.ffn_config) + self.ffn = DbrxFFN(hidden_size=config.d_model, ffn_config=config.ffn_config) def forward( self, @@ -854,14 +768,15 @@ def forward( use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, **kwargs: Any, - ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, Optional[torch.Tensor]], - Tuple[torch.Tensor, Optional[Cache]], Tuple[ - torch.Tensor, Optional[torch.Tensor], Optional[Cache]], - Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[torch.Tensor]], Tuple[ - torch.Tensor, Optional[Cache], Optional[torch.Tensor]], - Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], - Optional[torch.Tensor]],]: + ) -> Union[ + Tuple[torch.Tensor], + Tuple[torch.Tensor, Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[Cache]], + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]], + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[Cache], Optional[torch.Tensor]], + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], Optional[torch.Tensor]], + ]: """Forward function for DbrxBlock. Args: @@ -878,9 +793,9 @@ def forward( returned and can be used to speed up decoding (see `past_key_values`). cache_position (`torch.LongTensor`, optional): position ids of the cache """ - if 'padding_mask' in kwargs: + if "padding_mask" in kwargs: warnings.warn( - 'Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`' + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure to use `attention_mask` instead." ) # Norm + Attention + Norm @@ -897,9 +812,7 @@ def forward( # Fully Connected hidden_states, router_logits = self.ffn(hidden_states) - hidden_states = nn.functional.dropout(hidden_states, - p=self.resid_pdrop, - training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.resid_pdrop, training=self.training) hidden_states = resid_states + hidden_states outputs = (hidden_states,) @@ -918,10 +831,10 @@ def forward( class DbrxPreTrainedModel(PreTrainedModel): config_class = DbrxConfig - base_model_prefix = 'transformer' + base_model_prefix = "transformer" supports_gradient_checkpointing = True - _no_split_modules = ['DbrxBlock'] - _skip_keys_device_placement = ['past_key_values'] + _no_split_modules = ["DbrxBlock"] + _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn_2 = True _supports_sdpa = False _supports_cache_class = True @@ -945,26 +858,25 @@ def _init_weights(self, module: nn.Module): module.v1.data.normal_(mean=0.0, std=std) module.w2.data.normal_(mean=0.0, std=std) - def _setup_cache(self, cache_cls: Any, max_batch_size: int, - max_cache_len: int): # TODO: how to set var type of class? - if self.config._attn_implementation == 'flash_attention_2' and cache_cls == StaticCache: + def _setup_cache( + self, cache_cls: Any, max_batch_size: int, max_cache_len: int + ): # TODO: how to set var type of class? + if self.config._attn_implementation == "flash_attention_2" and cache_cls == StaticCache: raise ValueError( - '`static` cache implementation is not compatible with ' + - '`attn_implementation==flash_attention_2`. Make sure to use ' + - '`spda` in the mean time and open an issue at https://github.com/huggingface/transformers.' + "`static` cache implementation is not compatible with " + + "`attn_implementation==flash_attention_2`. Make sure to use " + + "`sdpa` in the meantime and open an issue at https://github.com/huggingface/transformers."
) for block in self.transformer.blocks: device = block.norm_attn_norm.norm_1.weight.device - if hasattr(self.config, '_pre_quantization_dtype'): + if hasattr(self.config, "_pre_quantization_dtype"): dtype = self.config._pre_quantization_dtype else: dtype = block.norm_attn_norm.attn.out_proj.weight.dtype - block.norm_attn_norm.attn.past_key_value = cache_cls(self.config, - max_batch_size, - max_cache_len, - device=device, - dtype=dtype) + block.norm_attn_norm.attn.past_key_value = cache_cls( + self.config, max_batch_size, max_cache_len, device=device, dtype=dtype + ) def _reset_cache(self): for block in self.transformer.blocks: @@ -986,11 +898,8 @@ def __init__(self, config: DbrxConfig): self.vocab_size = config.vocab_size self.emb_pdrop = config.emb_pdrop - self.wte = nn.Embedding(config.vocab_size, config.d_model, - self.padding_idx) - self.blocks = nn.ModuleList([ - DbrxBlock(config, block_idx) for block_idx in range(config.n_layers) - ]) + self.wte = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + self.blocks = nn.ModuleList([DbrxBlock(config, block_idx) for block_idx in range(config.n_layers)]) self.norm_f = nn.LayerNorm(config.d_model, bias=False) self.gradient_checkpointing = False @@ -1003,12 +912,10 @@ def get_input_embeddings(self) -> nn.Embedding: def set_input_embeddings(self, value: nn.Embedding): self.wte = value - def _autocast_input_embeddings(self, - inputs_embeds: torch.Tensor) -> torch.Tensor: - if inputs_embeds.device.type == 'cuda' and torch.is_autocast_enabled(): + def _autocast_input_embeddings(self, inputs_embeds: torch.Tensor) -> torch.Tensor: + if inputs_embeds.device.type == "cuda" and torch.is_autocast_enabled(): return inputs_embeds.to(dtype=torch.get_autocast_gpu_dtype()) - elif inputs_embeds.device.type == 'cpu' and torch.is_autocast_cpu_enabled( - ): + elif inputs_embeds.device.type == "cpu" and torch.is_autocast_cpu_enabled(): return inputs_embeds.to(dtype=torch.get_autocast_cpu_dtype()) else: return inputs_embeds @@ -1028,58 +935,50 @@ def forward( cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, MoeModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - output_router_logits = (output_router_logits - if output_router_logits is not None else - self.config.output_router_logits) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError( - 'You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one' + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" ) if self.gradient_checkpointing and self.training and use_cache: logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.' + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." 
) use_cache = False if inputs_embeds is None: inputs_embeds = self.wte(input_ids) - inputs_embeds = self._autocast_input_embeddings( - inputs_embeds) # type: ignore - inputs_embeds = nn.functional.dropout(inputs_embeds, - p=self.emb_pdrop, - training=self.training) + inputs_embeds = self._autocast_input_embeddings(inputs_embeds) # type: ignore + inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training) past_seen_tokens = 0 if use_cache: # kept for BC (cache positions) if not isinstance(past_key_values, StaticCache): - past_key_values = DynamicCache.from_legacy_cache( - past_key_values) + past_key_values = DynamicCache.from_legacy_cache(past_key_values) past_seen_tokens = past_key_values.get_seq_length( # type: ignore ) if cache_position is None: if isinstance(past_key_values, StaticCache): - raise ValueError( - 'cache_position is a required argument when using StaticCache.' - ) + raise ValueError("cache_position is a required argument when using StaticCache.") cache_position = torch.arange( # type: ignore - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device) + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) if position_ids is None: position_ids = cache_position.unsqueeze(0) # type: ignore - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, - cache_position) # type: ignore + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) # type: ignore # embed positions hidden_states = inputs_embeds @@ -1121,8 +1020,7 @@ def forward( hidden_states = block_outputs[0] if use_cache: - next_decoder_cache = block_outputs[ - 2 if output_attentions else 1] + next_decoder_cache = block_outputs[2 if output_attentions else 1] if output_attentions: all_self_attns += (block_outputs[1],) # type: ignore @@ -1140,13 +1038,15 @@ def forward( if use_cache: next_cache = ( next_decoder_cache.to_legacy_cache() # type: ignore - if isinstance(next_decoder_cache, Cache) else - next_decoder_cache) + if isinstance(next_decoder_cache, Cache) + else next_decoder_cache + ) if not return_dict: - return tuple(v for v in [ - hidden_states, next_cache, all_hidden_states, all_self_attns, - all_router_logits - ] if v is not None) + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) return MoeModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -1160,10 +1060,9 @@ def forward( # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 def _update_causal_mask( - self, attention_mask: Optional[torch.Tensor], - input_tensor: torch.Tensor, - cache_position: torch.Tensor) -> Optional[torch.Tensor]: - if self.config._attn_implementation == 'flash_attention_2': + self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor + ) -> Optional[torch.Tensor]: + if self.config._attn_implementation == "flash_attention_2": if attention_mask is not None and 0.0 in attention_mask: return attention_mask return None @@ -1171,76 +1070,64 @@ def _update_causal_mask( dtype, device = input_tensor.dtype, input_tensor.device min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] - if hasattr(self.blocks[0].norm_attn_norm.attn, - 'past_key_value'): # static cache + if hasattr(self.blocks[0].norm_attn_norm.attn, "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache - target_length = (attention_mask.shape[-1] if isinstance( - attention_mask, torch.Tensor) else cache_position[-1] + 1) + target_length = ( + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + ) target_length = int(target_length) - causal_mask = torch.full((sequence_length, target_length), - fill_value=min_dtype, - dtype=dtype, - device=device) + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange( - target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, - None, :, :].expand(input_tensor.shape[0], 1, - -1, -1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) if attention_mask is not None: - causal_mask = causal_mask.clone( - ) # copy to contiguous memory for in-place edit + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit if attention_mask.dim() == 2: mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[..., :mask_length].eq( - 0.0) * attention_mask[:, None, None, :].eq(0.0) - causal_mask[..., :mask_length] = causal_mask[ - ..., :mask_length].masked_fill(padding_mask, min_dtype) + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) elif attention_mask.dim() == 4: # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with # cache. In that case, the 4D attention mask attends to the newest tokens only. 
- if attention_mask.shape[ - -2] < cache_position[0] + sequence_length: + if attention_mask.shape[-2] < cache_position[0] + sequence_length: offset = cache_position[0] else: offset = 0 mask_shape = attention_mask.shape - mask_slice = (attention_mask.eq(0.0)).to( - dtype=dtype) * min_dtype - causal_mask[:mask_shape[0], :mask_shape[1], - offset:mask_shape[2] + - offset, :mask_shape[3]] = mask_slice - - if (self.config._attn_implementation == 'sdpa' and - attention_mask is not None and - attention_mask.device.type == 'cuda'): + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). is_tracing = ( - torch.jit.is_tracing() or - isinstance(input_tensor, torch.fx.Proxy) or # type: ignore - (hasattr(torch, '_dynamo') and torch._dynamo.is_compiling())) + torch.jit.is_tracing() + or isinstance(input_tensor, torch.fx.Proxy) # type: ignore + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) if not is_tracing and torch.any(attention_mask != 1): # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. # Details: https://github.com/pytorch/pytorch/issues/110213 - causal_mask = AttentionMaskConverter._unmask_unattended( - causal_mask, min_dtype) + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) return causal_mask class DbrxForCausalLM(DbrxPreTrainedModel): - def __init__(self, config: DbrxConfig): super().__init__(config) self.transformer = DbrxModel(config) self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.router_aux_loss_coef = config.router_aux_loss_coef self.num_experts = config.ffn_config.moe_num_experts self.num_experts_per_tok = config.ffn_config.moe_top_k @@ -1300,12 +1187,12 @@ def forward( ``` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - output_router_logits = (output_router_logits - if output_router_logits is not None else - self.config.output_router_logits) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) @@ -1348,8 +1235,7 @@ def forward( attention_mask, ) if labels is not None and loss is not None: - loss += self.router_aux_loss_coef * aux_loss.to( - loss.device) # make sure to reside in the same device + loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device if not return_dict: output = (logits,) + outputs[1:] @@ -1366,12 +1252,13 @@ def forward( ) def 
prepare_inputs_for_generation( - self, - input_ids: torch.Tensor, - past_key_values: Optional[Cache] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: Any) -> Dict[str, Any]: + self, + input_ids: torch.Tensor, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> Dict[str, Any]: past_length = 0 if past_key_values is not None: if isinstance(past_key_values, Cache): @@ -1386,10 +1273,8 @@ def prepare_inputs_for_generation( # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as # input) - if attention_mask is not None and attention_mask.shape[ - 1] > input_ids.shape[1]: - input_ids = input_ids[:, - -(attention_mask.shape[1] - past_length):] + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: @@ -1397,55 +1282,53 @@ def prepare_inputs_for_generation( # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if (max_cache_length is not None and attention_mask is not None and - cache_length + input_ids.shape[1] > max_cache_length): + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): attention_mask = attention_mask[:, -max_cache_length:] - position_ids = kwargs.get('position_ids', None) + position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1]:] + position_ids = position_ids[:, -input_ids.shape[1] :] - if self.generation_config.cache_implementation == 'static': + if self.generation_config.cache_implementation == "static": # generation with static cache - cache_position = kwargs.get('cache_position', None) + cache_position = kwargs.get("cache_position", None) if cache_position is None: past_length = 0 else: past_length = cache_position[-1] + 1 input_ids = input_ids[:, past_length:] - position_ids = position_ids[:, - past_length:] if position_ids is not None else None + position_ids = position_ids[:, past_length:] if position_ids is not None else None # TODO @gante we should only keep a `cache_position` in generate, and do +=1. # same goes for position ids. Could also help with continued generation. 
- input_length = position_ids.shape[ - -1] if position_ids is not None else input_ids.shape[-1] - cache_position = torch.arange(past_length, - past_length + input_length, - device=input_ids.device) - position_ids = position_ids.contiguous( - ) if position_ids is not None else None + input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1] + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) + position_ids = position_ids.contiguous() if position_ids is not None else None # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} + model_inputs = {"inputs_embeds": inputs_embeds} else: # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114 # TODO: use `next_tokens` directly instead. - model_inputs = {'input_ids': input_ids.contiguous()} + model_inputs = {"input_ids": input_ids.contiguous()} model_inputs.update( - { # type: ignore - 'position_ids': position_ids, - 'cache_position': cache_position, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, + { # type: ignore + "position_ids": position_ids, + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, } ) return model_inputs @@ -1454,7 +1337,7 @@ def prepare_inputs_for_generation( def _reorder_cache(past_key_values: Cache, beam_idx: torch.LongTensor): reordered_past = () for layer_past in past_key_values: - reordered_past += (tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past),) + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) return reordered_past diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1bdab80a13f6..dc2a2d2a4f60 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2457,6 +2457,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class DbrxBlock(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DbrxForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DbrxModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DbrxPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 3f841f74cc21..c5a6c5a9cbde 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -17,13 +17,11 @@ import unittest -from ...test_modeling_common import floats_tensor -from transformers import is_torch_available +from transformers import DbrxConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device 
-from transformers import DbrxConfig from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask if is_torch_available(): @@ -37,29 +35,29 @@ class DbrxModelTester: def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, ): self.parent = parent self.batch_size = batch_size @@ -151,7 +149,7 @@ def prepare_config_and_inputs_for_decoder(self): ) def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = DbrxModel(config=config) model.to(torch_device) @@ -162,16 +160,16 @@ def create_and_check_model( self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, ): config.add_cross_attention = True model = DbrxModel(config) @@ -194,16 +192,16 @@ def create_and_check_model_as_decoder( self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, ): model = DbrxForCausalLM(config=config) model.to(torch_device) @@ -211,15 +209,6 @@ def create_and_check_for_causal_lm( result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DbrxForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - 
self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - def create_and_check_decoder_model_past_large_inputs( self, config, @@ -282,60 +271,6 @@ def create_and_check_decoder_model_past_large_inputs( # test that outputs are equal for slice self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DbrxForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DbrxForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DbrxForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = DbrxForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -353,7 +288,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DbrxModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DbrxModel, @@ -448,7 +382,7 @@ def test_model_from_pretrained(self): class DbrxModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - model = DbrxForMaskedLM.from_pretrained("databricks/dbrx-instruct") + model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] @@ -464,5 +398,3 @@ def test_inference_masked_lm(self): ) self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, 
atol=1e-4)) - - From 46b45c16a978ce91df4832b0cb78872199c5a5f4 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Thu, 28 Mar 2024 22:20:55 +0000 Subject: [PATCH 007/131] pass configs down --- src/transformers/models/dbrx/modeling_dbrx.py | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index ed1502f1e671..6e2d64dfe47e 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -29,7 +29,7 @@ from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel from ...utils import is_flash_attn_2_available, logging -from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig +from .configuration_dbrx import DbrxConfig if is_flash_attn_2_available(): @@ -223,19 +223,16 @@ class DbrxAttention(nn.Module): def __init__( self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - attn_config: DbrxAttentionConfig, + config: DbrxConfig, block_idx: Optional[int] = None, ): super().__init__() - self.hidden_size = hidden_size - self.num_heads = num_heads + self.config = config + self.hidden_size = config.d_model + self.num_heads = config.n_heads self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = max_position_embeddings + self.max_position_embeddings = config.max_seq_len self.block_idx = block_idx - self.config = attn_config if block_idx is None: logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will " @@ -243,6 +240,7 @@ def __init__( + "when creating this class." ) + attn_config = config.attn_config self.attn_pdrop = attn_config.attn_pdrop self.clip_qkv = attn_config.clip_qkv self.num_key_value_heads = attn_config.kv_n_heads @@ -548,26 +546,18 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query class DbrxNormAttentionNorm(nn.Module): def __init__( self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - resid_pdrop: float, - attn_implementation: str, - attn_config: DbrxAttentionConfig, + config: DbrxConfig, block_idx: Optional[int] = None, ): super().__init__() self.block_idx = block_idx - self.resid_pdrop = resid_pdrop - self.norm_1 = nn.LayerNorm(hidden_size, bias=False) - self.attn = DBRX_ATTENTION_CLASSES[attn_implementation]( - hidden_size=hidden_size, - num_heads=num_heads, - max_position_embeddings=max_position_embeddings, - attn_config=attn_config, + self.resid_pdrop = config.resid_pdrop + self.norm_1 = nn.LayerNorm(config.d_model, bias=False) + self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( + config=config, block_idx=block_idx, ) - self.norm_2 = nn.LayerNorm(hidden_size, bias=False) + self.norm_2 = nn.LayerNorm(config.d_model, bias=False) def forward( self, @@ -715,11 +705,12 @@ def forward( class DbrxFFN(nn.Module): - def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): + def __init__(self, config: DbrxConfig): super().__init__() + ffn_config = config.ffn_config self.router = DbrxRouter( - hidden_size, + hidden_size=config.d_model, moe_num_experts=ffn_config.moe_num_experts, moe_top_k=ffn_config.moe_top_k, moe_jitter_eps=ffn_config.moe_jitter_eps, @@ -728,7 +719,7 @@ def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): ) self.experts = DbrxExperts( - hidden_size=hidden_size, + hidden_size=config.d_model, 
ffn_hidden_size=ffn_config.ffn_hidden_size, moe_num_experts=ffn_config.moe_num_experts, ffn_act_fn=ffn_config.ffn_act_fn, @@ -747,15 +738,10 @@ def __init__(self, config: DbrxConfig, block_idx: int): self.resid_pdrop = config.resid_pdrop self.block_idx = block_idx self.norm_attn_norm = DbrxNormAttentionNorm( - hidden_size=config.d_model, - num_heads=config.n_heads, - max_position_embeddings=config.max_seq_len, - resid_pdrop=config.resid_pdrop, - attn_implementation=config._attn_implementation, - attn_config=config.attn_config, + config=config, block_idx=block_idx, ) - self.ffn = DbrxFFN(hidden_size=config.d_model, ffn_config=config.ffn_config) + self.ffn = DbrxFFN(config=config) def forward( self, From 76c2e9c5a92a1ff01bb2c68e15cc2da881bd2c12 Mon Sep 17 00:00:00 2001 From: Abhi Venigalla Date: Thu, 28 Mar 2024 23:07:04 +0000 Subject: [PATCH 008/131] add sdpa attention --- src/transformers/models/dbrx/modeling_dbrx.py | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 6e2d64dfe47e..401bb8393848 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -537,9 +537,103 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query ) +class DbrxSdpaAttention(DbrxAttention): + """ + Dbrx attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `DbrxAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "DbrxModel is using DbrxSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = qkv_states.split( + [ + self.hidden_size, + self.num_key_value_heads * self.head_dim, + self.num_key_value_heads * self.head_dim, + ], + dim=2, + ) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
+ if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attn_pdrop if self.training else 0.0, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + DBRX_ATTENTION_CLASSES = { "eager": DbrxAttention, "flash_attention_2": DbrxFlashAttention2, + "sdpa": DbrxSdpaAttention, } From 4e74661e87c45e1ff8c1fa972db18e43e6dbcf9a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 14:02:39 +0000 Subject: [PATCH 009/131] remove DbrxBlock --- src/transformers/__init__.py | 88 ++++++++++++---------- src/transformers/models/dbrx/__init__.py | 8 +- src/transformers/utils/dummy_pt_objects.py | 7 -- 3 files changed, 49 insertions(+), 54 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3494821816c2..b22bf2cf2fd4 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1937,7 +1937,6 @@ ) _import_structure["models.dbrx"].extend( [ - "DbrxBlock", "DbrxForCausalLM", "DbrxModel", "DbrxPreTrainedModel", @@ -4616,9 +4615,7 @@ if not is_torchaudio_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import ( - dummy_torchaudio_objects, - ) + from .utils import dummy_torchaudio_objects _import_structure["utils.dummy_torchaudio_objects"] = [ name for name in dir(dummy_torchaudio_objects) if not name.startswith("_") @@ -5275,7 +5272,10 @@ TransfoXLTokenizer, ) from .models.deprecated.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig - from .models.depth_anything import DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, DepthAnythingConfig + from .models.depth_anything import ( + DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, + DepthAnythingConfig, + ) from .models.deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig @@ -5331,7 +5331,11 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) - from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer + from .models.flaubert import ( + FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + FlaubertConfig, + FlaubertTokenizer, + ) from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, FlavaConfig, @@ -5395,10 +5399,7 @@ from .models.herbert import HerbertTokenizer from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig - from .models.idefics import ( - IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, - IdeficsConfig, - ) + from .models.idefics import IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, IdeficsConfig from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig from .models.instructblip import ( @@ -5627,7 +5628,11 @@ from .models.pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig from .models.pvt_v2 import PvtV2Config from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig - 
from .models.qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config, Qwen2Tokenizer + from .models.qwen2 import ( + QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, + Qwen2Config, + Qwen2Tokenizer, + ) from .models.qwen2_moe import QWEN2MOE_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2MoeConfig from .models.rag import RagConfig, RagRetriever, RagTokenizer from .models.realm import ( @@ -5677,7 +5682,10 @@ SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, SeamlessM4Tv2Config, ) - from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig + from .models.segformer import ( + SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + SegformerConfig, + ) from .models.seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig @@ -5720,8 +5728,14 @@ SqueezeBertTokenizer, ) from .models.stablelm import STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP, StableLmConfig - from .models.starcoder2 import STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP, Starcoder2Config - from .models.superpoint import SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP, SuperPointConfig + from .models.starcoder2 import ( + STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP, + Starcoder2Config, + ) + from .models.superpoint import ( + SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP, + SuperPointConfig, + ) from .models.swiftformer import ( SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SwiftFormerConfig, @@ -5763,12 +5777,12 @@ TvltFeatureExtractor, TvltProcessor, ) - from .models.tvp import ( - TVP_PRETRAINED_CONFIG_ARCHIVE_MAP, - TvpConfig, - TvpProcessor, + from .models.tvp import TVP_PRETRAINED_CONFIG_ARCHIVE_MAP, TvpConfig, TvpProcessor + from .models.udop import ( + UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP, + UdopConfig, + UdopProcessor, ) - from .models.udop import UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP, UdopConfig, UdopProcessor from .models.umt5 import UMT5Config from .models.unispeech import ( UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5792,10 +5806,7 @@ ViltImageProcessor, ViltProcessor, ) - from .models.vipllava import ( - VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, - VipLlavaConfig, - ) + from .models.vipllava import VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, VipLlavaConfig from .models.vision_encoder_decoder import VisionEncoderDecoderConfig from .models.vision_text_dual_encoder import ( VisionTextDualEncoderConfig, @@ -6010,7 +6021,13 @@ ) # bitsandbytes config - from .utils.quantization_config import AqlmConfig, AwqConfig, BitsAndBytesConfig, GPTQConfig, QuantoConfig + from .utils.quantization_config import ( + AqlmConfig, + AwqConfig, + BitsAndBytesConfig, + GPTQConfig, + QuantoConfig, + ) try: if not is_sentencepiece_available(): @@ -6655,11 +6672,7 @@ CodeGenModel, CodeGenPreTrainedModel, ) - from .models.cohere import ( - CohereForCausalLM, - CohereModel, - CoherePreTrainedModel, - ) + from .models.cohere import CohereForCausalLM, CohereModel, CoherePreTrainedModel from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, ConditionalDetrForObjectDetection, @@ -6737,12 +6750,7 @@ ) # PyTorch model imports - from .models.dbrx import ( - DbrxBlock, - DbrxForCausalLM, - DbrxModel, - DbrxPreTrainedModel, - ) + from .models.dbrx import DbrxForCausalLM, DbrxModel, DbrxPreTrainedModel from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaForMaskedLM, @@ -7032,10 +7040,7 @@ FunnelPreTrainedModel, load_tf_weights_in_funnel, ) - from .models.fuyu import ( - FuyuForCausalLM, - 
FuyuPreTrainedModel, - ) + from .models.fuyu import FuyuForCausalLM, FuyuPreTrainedModel from .models.gemma import ( GemmaForCausalLM, GemmaForSequenceClassification, @@ -8937,7 +8942,10 @@ except OptionalDependencyNotAvailable: from .utils.dummy_torchaudio_objects import * else: - from .models.musicgen_melody import MusicgenMelodyFeatureExtractor, MusicgenMelodyProcessor + from .models.musicgen_melody import ( + MusicgenMelodyFeatureExtractor, + MusicgenMelodyProcessor, + ) try: if not is_flax_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 7660c376198e..9b1e325896bb 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -28,7 +28,6 @@ else: _import_structure["modeling_dbrx"] = [ "DbrxForCausalLM", - "DbrxBlock", "DbrxModel", "DbrxPreTrainedModel", ] @@ -43,12 +42,7 @@ except OptionalDependencyNotAvailable: pass else: - from .modeling_dbrx import ( - DbrxBlock, - DbrxForCausalLM, - DbrxModel, - DbrxPreTrainedModel, - ) + from .modeling_dbrx import DbrxForCausalLM, DbrxModel, DbrxPreTrainedModel else: diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index dc2a2d2a4f60..68d1dca167f0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2457,13 +2457,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class DbrxBlock(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class DbrxForCausalLM(metaclass=DummyObject): _backends = ["torch"] From 120df4045327e9a18a1aa5e22ae16ccabcbeefe7 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 14:18:02 +0000 Subject: [PATCH 010/131] add to configuration_auto --- src/transformers/models/auto/configuration_auto.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bf46066002fe..d1aec920bee7 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -21,15 +21,18 @@ from typing import List, Union from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...dynamic_module_utils import ( + get_class_from_dynamic_module, + resolve_trust_remote_code, +) from ...utils import CONFIG_NAME, logging - logger = logging.get_logger(__name__) -from ..deprecated._archive_maps import CONFIG_ARCHIVE_MAP_MAPPING_NAMES # noqa: F401, E402 - +from ..deprecated._archive_maps import ( # noqa: F401, E402 + CONFIG_ARCHIVE_MAP_MAPPING_NAMES, +) CONFIG_MAPPING_NAMES = OrderedDict( [ @@ -77,6 +80,7 @@ ("data2vec-audio", "Data2VecAudioConfig"), ("data2vec-text", "Data2VecTextConfig"), ("data2vec-vision", "Data2VecVisionConfig"), + ("dbrx", "DbrxConfig"), ("deberta", "DebertaConfig"), ("deberta-v2", "DebertaV2Config"), ("decision_transformer", "DecisionTransformerConfig"), @@ -334,6 +338,7 @@ ("data2vec-audio", "Data2VecAudio"), ("data2vec-text", "Data2VecText"), ("data2vec-vision", "Data2VecVision"), + ("dbrx", "Dbrx"), ("deberta", "DeBERTa"), ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), From 56d841eeb49a6c00f316660d75ec35ea4c390fdd Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: 
Fri, 29 Mar 2024 14:26:19 +0000 Subject: [PATCH 011/131] docstring now passes formatting test --- src/transformers/models/dbrx/modeling_dbrx.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 401bb8393848..a9f2dda9e7e0 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -31,14 +31,10 @@ from ...utils import is_flash_attn_2_available, logging from .configuration_dbrx import DbrxConfig - if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -969,7 +965,9 @@ class DbrxModel(DbrxPreTrainedModel): [`DbrxBlock`] layers. Args: - config: DbrxConfig + config ([`DbrxConfig`]): Model configuration class with all parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ def __init__(self, config: DbrxConfig): From 450ae2d4805ee87f829fb2f37083992e8c28e94a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 14:52:41 +0000 Subject: [PATCH 012/131] fix style --- src/transformers/models/auto/configuration_auto.py | 2 ++ src/transformers/models/dbrx/modeling_dbrx.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d1aec920bee7..c8fdbb74039a 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -27,6 +27,7 @@ ) from ...utils import CONFIG_NAME, logging + logger = logging.get_logger(__name__) @@ -34,6 +35,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES, ) + CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index a9f2dda9e7e0..82eb06785399 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -31,10 +31,14 @@ from ...utils import is_flash_attn_2_available, logging from .configuration_dbrx import DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) From cec735697caa375f6188c3de5a8ba9224b4541de Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 14:56:56 +0000 Subject: [PATCH 013/131] update READMEs --- README.md | 3 ++- README_de.md | 3 ++- README_es.md | 9 +++++---- README_fr.md | 1 + README_hd.md | 5 +++-- README_ja.md | 5 +++-- README_ko.md | 5 +++-- README_pt-br.md | 9 +++++---- README_ru.md | 9 +++++---- README_te.md | 7 ++++--- README_vi.md | 3 ++- README_zh-hans.md | 9 +++++---- README_zh-hant.md | 7 ++++--- docs/source/en/index.md | 1 + 14 files changed, 45 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 
4a3b78756716..21cfa5f20a22 100644 --- a/README.md +++ b/README.md @@ -331,7 +331,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -341,6 +341,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_de.md b/README_de.md index 5c3fa28ccba8..2461ded125eb 100644 --- a/README_de.md +++ b/README_de.md @@ -327,7 +327,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. 
**[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -337,6 +337,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_es.md b/README_es.md index 9a6ea777a790..26e612ea4c13 100644 --- a/README_es.md +++ b/README_es.md @@ -304,7 +304,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -314,6 +314,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. 
**[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. @@ -443,7 +444,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. 
**[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen1.5/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. @@ -471,9 +472,9 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. 
**[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. -1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. +1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo. 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo. diff --git a/README_fr.md b/README_fr.md index 7f7fe2343e27..a71f6dd6e29c 100644 --- a/README_fr.md +++ b/README_fr.md @@ -335,6 +335,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (de Salesforce) publié dans l'article [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) par Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong et Richard Socher. 1. 
**[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (de Microsoft) publié dans l'article [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) par Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (de Facebook) publié dans l'article [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) par Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publié dans l'article [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) par Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_hd.md b/README_hd.md index 12df2d0740c9..1c3eab7004d5 100644 --- a/README_hd.md +++ b/README_hd.md @@ -288,6 +288,7 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा। 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा पोस्ट किया गया। 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (बर्कले/फेसबुक/गूगल से) पेपर के साथ [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) लिली चेन, केविन लू, अरविंद राजेश्वरन, किमिन ली, आदित्य ग्रोवर, माइकल लास्किन, पीटर एबील, अरविंद श्रीनिवास, इगोर मोर्डच द्वारा पोस्ट किया गया। @@ -417,7 +418,7 @@ conda install conda-forge::transformers 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (माइक्रोसॉफ्ट रिसर्च से) साथ में पेपर [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) यू यान, वीज़ेन क्यूई, येयुन गोंग, दयाहेंग लियू, नान डुआन, जिउशेंग चेन, रुओफ़ेई झांग और मिंग झोउ द्वारा पोस्ट किया गया। 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) के साथ जारी किया गया -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) के साथ जारी किया गया +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. से) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. द्वाराअनुसंधान पत्र [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) के साथ जारी किया गया 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA से) साथ वाला पेपर [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) हाओ वू, पैट्रिक जुड, जिआओजी झांग, मिखाइल इसेव और पॉलियस माइकेविसियस द्वारा। 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (the Qwen team, Alibaba Group से) Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 
द्वाराअनुसंधान पत्र [Qwen Technical Report](https://arxiv.org/abs/2309.16609) के साथ जारी किया गया 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group से) Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. द्वाराअनुसंधान पत्र [blog post](https://qwenlm.github.io/blog/qwen1.5/) के साथ जारी किया गया @@ -445,7 +446,7 @@ conda install conda-forge::transformers 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (फेसबुक से) साथ में पेपर [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) चांगहान वांग, ऐनी वू, जुआन पिनो, एलेक्सी बेवस्की, माइकल औली, एलेक्सिस द्वारा Conneau द्वारा पोस्ट किया गया। 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (तेल अवीव यूनिवर्सिटी से) साथ में पेपर [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) ओरि राम, युवल कर्स्टन, जोनाथन बेरेंट, अमीर ग्लोबर्सन, ओमर लेवी द्वारा। 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (बर्कले से) कागज के साथ [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) फॉरेस्ट एन. इनडोला, अल्बर्ट ई. शॉ, रवि कृष्णा, और कर्ट डब्ल्यू. केटज़र द्वारा। -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. 
**[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. 1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI से) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. द्वाराअनुसंधान पत्र [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) के साथ जारी किया गया diff --git a/README_ja.md b/README_ja.md index 78cd7b0474be..ddaeee26738d 100644 --- a/README_ja.md +++ b/README_ja.md @@ -348,6 +348,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) @@ -477,7 +478,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research から) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou から公開された研究論文: [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. から) Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. から公開された研究論文 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA から) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius から公開された研究論文: [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (the Qwen team, Alibaba Group から) Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. から公開された研究論文 [Qwen Technical Report](https://arxiv.org/abs/2309.16609) 1. 
**[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group から) Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. から公開された研究論文 [blog post](https://qwenlm.github.io/blog/qwen1.5/) @@ -505,7 +506,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook から), Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau から公開された研究論文: [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University から), Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy から公開された研究論文: [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley から) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer から公開された研究論文: [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. 1. 
**[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI から) Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. から公開された研究論文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) diff --git a/README_ko.md b/README_ko.md index 1798760d86e9..ce6b1e1e7d75 100644 --- a/README_ko.md +++ b/README_ko.md @@ -263,6 +263,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다. @@ -392,7 +393,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (Microsoft Research 에서) Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 의 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 논문과 함께 발표했습니다. 1. 
**[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf)논문과 함께 발표했습니다. -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)논문과 함께 발표했습니다. +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc. 에서 제공)은 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.의 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797)논문과 함께 발표했습니다. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (NVIDIA 에서) Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 의 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 논문과 함께 발표했습니다. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (the Qwen team, Alibaba Group 에서 제공)은 Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu.의 [Qwen Technical Report](https://arxiv.org/abs/2309.16609)논문과 함께 발표했습니다. 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group 에서 제공)은 Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.의 [blog post](https://qwenlm.github.io/blog/qwen1.5/)논문과 함께 발표했습니다. @@ -420,7 +421,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (Facebook 에서) Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 의 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 논문과 함께 발표했습니다. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (Tel Aviv University 에서) Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 의 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 논문과 함께 발표했습니다. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (Berkeley 에서) Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. 
Keutzer 의 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 논문과 함께 발표했습니다. -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. 1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (MBZUAI 에서 제공)은 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.의 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446)논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index 899acaf7f1c4..47ac5a587ce3 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -333,10 +333,10 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. 
**[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. -1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. +1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. 
**[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -346,6 +346,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. @@ -380,7 +381,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. 1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. 
**[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b) +1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b) 1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team. 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim. @@ -435,7 +436,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. -1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. +1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. 
**[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. diff --git a/README_ru.md b/README_ru.md index fdb647996556..d00dc7a3d2e9 100644 --- a/README_ru.md +++ b/README_ru.md @@ -323,10 +323,10 @@ conda install conda-forge::transformers 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. -1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. +1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. 
**[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -336,6 +336,7 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. 
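The hunk above adds the DBRX index entry to the Russian README, matching the entries added to the other localized READMEs in this patch. As a minimal sketch of what the new model support looks like from a user's point of view, the snippet below loads the model through the usual Auto classes. It relies on the `DbrxForCausalLM` auto-mapping registered later in this patch, and the checkpoint id `databricks/dbrx-instruct` is purely illustrative, not something this patch publishes.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint id (an assumption; this patch does not publish any weights).
model_id = "databricks/dbrx-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The ("dbrx", "DbrxForCausalLM") entry added to modeling_auto.py lets
# AutoModelForCausalLM resolve the DBRX config type to the right class.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

inputs = tokenizer("DBRX is an open large language model that", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Since the supported-frameworks table added later in this patch lists DBRX for PyTorch only, the sketch assumes a PyTorch install; `device_map="auto"` additionally requires `accelerate`.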
@@ -424,8 +425,8 @@ conda install conda-forge::transformers 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. -1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. -1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. +1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. +1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. 1. 
**[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. diff --git a/README_te.md b/README_te.md index 8906438d1fb0..475eb0323770 100644 --- a/README_te.md +++ b/README_te.md @@ -325,10 +325,10 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov. 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker. -1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. +1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. 
**[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -338,6 +338,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. @@ -427,7 +428,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao. 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. -1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. +1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. diff --git a/README_vi.md b/README_vi.md index 5aabe6ccc353..cee9203c5ad9 100644 --- a/README_vi.md +++ b/README_vi.md @@ -327,7 +327,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** được phát hành với bài báo [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (từ Salesforce) được phát hành với bài báo [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (từ MetaAI) được phát hành với bài báo [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (từ Cohere) được phát hành với bài báo [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (từ Cohere) được phát hành với bài báo [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (từ Microsoft Research Asia) được phát hành với bài báo [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (từ YituTech) được phát hành với bài báo [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (từ Facebook AI) được phát hành với bài báo [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -337,6 +337,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (từ Salesforce) được phát hành với bài báo [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (từ Microsoft) được phát hành với bài báo [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (từ Facebook) được phát hành với bài báo [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. 
**[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (từ Berkeley/Facebook/Google) được phát hành với bài báo [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_zh-hans.md b/README_zh-hans.md index ca3d42eb00b9..7ed2706e4ca9 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -277,7 +277,7 @@ conda install conda-forge::transformers 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (来自 Salesforce) 伴随论文 [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) 由 Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong 发布。 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (来自 MetaAI) 伴随论文 [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) 由 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve 发布。 -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (来自 Cohere) 伴随论文 [Command-R: Retrieval Augmented Generation at Production Scale]() 由 Cohere 发布。 +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (来自 Cohere) 伴随论文 [Command-R: Retrieval Augmented Generation at Production Scale]() 由 Cohere 发布。 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (来自 Microsoft Research Asia) 伴随论文 [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) 由 Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang 发布。 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (来自 Facebook AI) 伴随论文 [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) 由 Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie 发布。 @@ -287,6 +287,7 @@ conda install conda-forge::transformers 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 +1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。 @@ -416,7 +417,7 @@ conda install conda-forge::transformers 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (来自 Nanjing University, The University of Hong Kong etc.) 伴随论文 [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (来自 Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) 伴随论文 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (来自 Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) 
伴随论文 [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) 由 Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (来自 the Qwen team, Alibaba Group) 伴随论文 [Qwen Technical Report](https://arxiv.org/abs/2309.16609) 由 Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu 发布。 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (来自 the Qwen team, Alibaba Group) 伴随论文 [blog post](https://qwenlm.github.io/blog/qwen1.5/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou 发布. @@ -444,9 +445,9 @@ conda install conda-forge::transformers 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。 -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. 
**[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. -1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. +1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (来自 MBZUAI) 伴随论文 [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) 由 Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan 发布。 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (来自 Microsoft) 伴随论文 [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) 由 Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo 发布。 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (来自 Microsoft) 伴随论文 [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) 由 Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 78278a76a289..a485163da022 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -289,7 +289,7 @@ conda install conda-forge::transformers 1. 
**[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -299,6 +299,7 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. +1. 
**[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. @@ -428,7 +429,7 @@ conda install conda-forge::transformers 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. 
**[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen1.5/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. @@ -456,7 +457,7 @@ conda install conda-forge::transformers 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. 
**[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. 1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ffa9ae3f4b0b..613981b92b58 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -107,6 +107,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ | | [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ | | [Data2VecVision](model_doc/data2vec) | ✅ | ✅ | ❌ | +| [Dbrx](model_doc/dbrx) | ✅ | ❌ | ❌ | | [DeBERTa](model_doc/deberta) | ✅ | ✅ | ❌ | | [DeBERTa-v2](model_doc/deberta-v2) | ✅ | ✅ | ❌ | | [Decision Transformer](model_doc/decision_transformer) | ✅ | ❌ | ❌ | From b5d4a6ead3f330befa52fd1d795ff0f802e8db20 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 15:17:47 +0000 Subject: [PATCH 014/131] add dbrx to modeling_auto --- src/transformers/models/auto/modeling_auto.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 150dea04f374..8841643a6053 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -75,6 +75,7 @@ ("data2vec-audio", "Data2VecAudioModel"), ("data2vec-text", "Data2VecTextModel"), ("data2vec-vision", "Data2VecVisionModel"), + ("dbrx", "DbrxModel"), ("deberta", "DebertaModel"), ("deberta-v2", "DebertaV2Model"), ("decision_transformer", "DecisionTransformerModel"), @@ -433,6 +434,7 @@ ("cpmant", "CpmAntForCausalLM"), ("ctrl", "CTRLLMHeadModel"), ("data2vec-text", "Data2VecTextForCausalLM"), + ("dbrx", "DbrxForCausalLM"), ("electra", "ElectraForCausalLM"), ("ernie", "ErnieForCausalLM"), ("falcon", "FalconForCausalLM"), From 3d9fd1635e266c5aa19d598facbd87943d34d0cf Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 15:58:48 +0000 Subject: [PATCH 015/131] make fix-copies generated this --- docs/source/en/tasks/language_modeling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md index e1858ef24859..5ec9f6a62845 100644 --- a/docs/source/en/tasks/language_modeling.md +++ b/docs/source/en/tasks/language_modeling.md @@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the Choose one of the following architectures: -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI 
GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [Dbrx](../model_doc/dbrx), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), 
[X-MOD](../model_doc/xmod) From 2bff6b9da14df845756f5f86821dd7fb2dc2b4d8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 15:59:11 +0000 Subject: [PATCH 016/131] add DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP --- src/transformers/__init__.py | 11 +++++++++-- src/transformers/models/dbrx/__init__.py | 4 ++-- src/transformers/models/dbrx/configuration_dbrx.py | 3 +++ src/transformers/models/deprecated/_archive_maps.py | 3 +++ src/transformers/utils/dummy_pt_objects.py | 3 +++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b22bf2cf2fd4..a582df36fe80 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -327,7 +327,7 @@ "Data2VecTextConfig", "Data2VecVisionConfig", ], - "models.dbrx": ["DbrxConfig"], + "models.dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"], "models.deberta": [ "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", @@ -1937,6 +1937,7 @@ ) _import_structure["models.dbrx"].extend( [ + "DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxForCausalLM", "DbrxModel", "DbrxPreTrainedModel", @@ -5226,6 +5227,7 @@ Data2VecTextConfig, Data2VecVisionConfig, ) + from .models.dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig from .models.deberta import ( DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, @@ -6750,7 +6752,12 @@ ) # PyTorch model imports - from .models.dbrx import DbrxForCausalLM, DbrxModel, DbrxPreTrainedModel + from .models.dbrx import ( + DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, + DbrxForCausalLM, + DbrxModel, + DbrxPreTrainedModel, + ) from .models.deberta import ( DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, DebertaForMaskedLM, diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 9b1e325896bb..75548996fb55 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -17,7 +17,7 @@ _import_structure = { - "configuration_dbrx": ["DbrxConfig"], + "configuration_dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"], } try: @@ -34,7 +34,7 @@ if TYPE_CHECKING: - from .configuration_dbrx import DbrxConfig + from .configuration_dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig try: if not is_torch_available(): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 048526d4ac93..bbaa45547ba7 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -18,6 +18,9 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..deprecated._archive_maps import ( # noqa: F401, E402 + DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, +) logger = logging.get_logger(__name__) diff --git a/src/transformers/models/deprecated/_archive_maps.py b/src/transformers/models/deprecated/_archive_maps.py index f7b0679a3e4f..2e8a7fc67893 100644 --- a/src/transformers/models/deprecated/_archive_maps.py +++ b/src/transformers/models/deprecated/_archive_maps.py @@ -532,6 +532,8 @@ def __getitem__(self, item): DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = DeprecatedList(["facebook/data2vec-vision-base-ft1k"]) +DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict({}) + DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict( { "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json", @@ -2580,6 +2582,7 @@ def __getitem__(self, item): ("data2vec-audio", 
"DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("data2vec-text", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("data2vec-vision", "DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("dbrx", "DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 68d1dca167f0..f734a2faff23 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2457,6 +2457,9 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = None + + class DbrxForCausalLM(metaclass=DummyObject): _backends = ["torch"] From ea940a6b37d67f2efb9fe3afcb9efaacd865e0d3 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 16:09:03 +0000 Subject: [PATCH 017/131] config docstring passes formatting test --- src/transformers/models/dbrx/configuration_dbrx.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index bbaa45547ba7..8162f395ff3b 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -154,10 +154,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> " class DbrxConfig(PretrainedConfig): - """Configuration class for Dbrx. + r""" - [`DbrxModel`]. It is used to instantiate a Dbrx model according to the - specified arguments, defining the model architecture. + This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the + specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. From 990f196d5e26495f75f12bae209ea8b21bf37bff Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 16:49:07 +0000 Subject: [PATCH 018/131] rename moe_loss_weight to router_aux_loss_coef --- src/transformers/models/dbrx/configuration_dbrx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 8162f395ff3b..165e32f4f8e7 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -99,7 +99,7 @@ class DbrxFFNConfig(PretrainedConfig): moe_num_experts (int, optional): The number of experts in the mixture of experts layer. moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. - moe_loss_weight (float, optional): The loss weight for the mixture of experts layer. + router_aux_loss_coef (float, optional): The loss weight for the mixture of experts layer. moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. 
uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. This should only be used for benchmarking purposes. @@ -112,7 +112,7 @@ def __init__( moe_num_experts: int = 4, moe_top_k: int = 1, moe_jitter_eps: Optional[float] = None, - moe_loss_weight: float = 0.01, + router_aux_loss_coef: float = 0.01, moe_normalize_expert_weights: Optional[float] = 1, uniform_expert_assignment: bool = False, **kwargs: Any, @@ -125,7 +125,7 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k self.moe_jitter_eps = moe_jitter_eps - self.moe_loss_weight = moe_loss_weight + self.router_aux_loss_coef = router_aux_loss_coef self.moe_normalize_expert_weights = moe_normalize_expert_weights self.uniform_expert_assignment = uniform_expert_assignment From 4a6f47ade86b85217e708bab2a0a1f347d2d55f9 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 16:59:18 +0000 Subject: [PATCH 019/131] add to flash-attn documentation --- docs/source/en/perf_infer_gpu_one.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 90409b1c21bc..011d38da558d 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -40,6 +40,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) @@ -93,8 +94,8 @@ model_id = "tiiuae/falcon-7b" tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained( - model_id, - torch_dtype=torch.bfloat16, + model_id, + torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", ) ``` @@ -106,7 +107,7 @@ FlashAttention-2 can only be used when the model's dtype is `fp16` or `bf16`. Ma
You can also set `use_flash_attention_2=True` to enable FlashAttention-2 but it is deprecated in favor of `attn_implementation="flash_attention_2"`. - + FlashAttention-2 can be combined with other optimization techniques like quantization to further speedup inference. For example, you can combine FlashAttention-2 with 8-bit or 4-bit quantization: @@ -120,14 +121,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # load in 8bit model = AutoModelForCausalLM.from_pretrained( - model_id, + model_id, load_in_8bit=True, attn_implementation="flash_attention_2", ) # load in 4bit model = AutoModelForCausalLM.from_pretrained( - model_id, + model_id, load_in_4bit=True, attn_implementation="flash_attention_2", ) From 9268388c5b8e9d68395230f823b422deadf34662 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 29 Mar 2024 17:51:04 +0000 Subject: [PATCH 020/131] fix model-path in tests --- docs/source/en/model_doc/dbrx.md | 2 +- src/transformers/models/dbrx/modeling_dbrx.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 54417551a164..fefdc8b91b4a 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -32,7 +32,7 @@ We used curriculum learning for pretraining, changing the data mix during traini More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). -This model was contributed by [abhi-db](>> from transformers import AutoTokenizer, DbrxForCausalLM - >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx") - >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx") + >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct") >>> prompt = "Hey, are you conscious? Can you talk to me?" 
>>> inputs = tokenizer(prompt, return_tensors="pt") From 54d98a405dc9a7aeef8365746927645be90e3d72 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Sun, 31 Mar 2024 10:45:36 -0400 Subject: [PATCH 021/131] Explicitly make `"silu"` the default `ffn_act_fn` Co-authored-by: Wing Lian --- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 1abf81c6aa10..851dd3d049ca 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -744,7 +744,7 @@ def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - act_fn_name = ffn_act_fn.pop("name") + act_fn_name = ffn_act_fn.pop("name", "silu") if len(ffn_act_fn) != 0: raise ValueError(f"FFN activation function has unhandled kwargs {ffn_act_fn=}") self.activation_fn = ACT2FN[act_fn_name] From 370f57875ce50ef2842dbcded73c3a3ff86f9749 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 15:02:38 +0000 Subject: [PATCH 022/131] default to using router_aux_loss_coef over ffn_config[moe_loss_weight] --- src/transformers/models/dbrx/configuration_dbrx.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 165e32f4f8e7..fc5a917eafcd 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -125,7 +125,9 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k self.moe_jitter_eps = moe_jitter_eps - self.router_aux_loss_coef = router_aux_loss_coef + self.router_aux_loss_coef = ( + router_aux_loss_coef if "moe_loss_weight" not in kwargs else kwargs["moe_loss_weight"] + ) self.moe_normalize_expert_weights = moe_normalize_expert_weights self.uniform_expert_assignment = uniform_expert_assignment @@ -245,6 +247,10 @@ def __init__( if ffn_config is None: self.ffn_config = DbrxFFNConfig() elif isinstance(ffn_config, dict): + # use router_aux_loss_coef over ffn_config["moe_loss_weight"] + if "moe_loss_weight" in ffn_config and "router_aux_loss_coef" not in ffn_config: + ffn_config["router_aux_loss_coef"] = ffn_config["moe_loss_weight"] + del ffn_config["moe_loss_weight"] self.ffn_config = DbrxFFNConfig(**ffn_config) else: self.ffn_config = ffn_config From 7aba29f082c07f9c4c7f690966dc945477d0f4c3 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 15:15:29 +0000 Subject: [PATCH 023/131] fix _flash_attn_uses_top_left_mask and is_causal --- src/transformers/models/dbrx/modeling_dbrx.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 851dd3d049ca..155535687e66 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -28,7 +28,11 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel -from ...utils import is_flash_attn_2_available, logging +from ...utils import (
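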
is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, +) from .configuration_dbrx import DbrxConfig @@ -246,6 +250,7 @@ def __init__( self.num_key_value_heads = attn_config.kv_n_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.rope_theta = attn_config.rope_theta + self.is_causal = True self.Wqkv = nn.Linear( self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=False @@ -340,6 +345,12 @@ def __init__(self, *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + # From: https://github.com/huggingface/transformers/blob/3b8e2932ce743008f63585aae1e1b8b30dc8b3ac/src/transformers/models/gemma/modeling_gemma.py#L318 + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + def forward( self, hidden_states: torch.Tensor, From 94756751eebfcb4acf2f2179f676a718354979bc Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 15:28:59 +0000 Subject: [PATCH 024/131] fix tests path --- templates/adding_a_new_model/README.md | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md index 9f3b9161fffd..52f481dcb3af 100644 --- a/templates/adding_a_new_model/README.md +++ b/templates/adding_a_new_model/README.md @@ -25,7 +25,7 @@ Jump to the [Add new model like section](#add-new-model-like-command) to learn h ## Cookiecutter Templates -Using the `cookiecutter` utility requires to have all the `dev` dependencies installed. Let's first clone the +Using the `cookiecutter` utility requires to have all the `dev` dependencies installed. Let's first clone the
@@ -74,19 +74,19 @@ The tokenizer should either be based on BERT if it behaves exactly like the BERT Select tokenizer_type: 1 - Based on BERT 2 - Standalone -Choose from 1, 2 [1]: +Choose from 1, 2 [1]: ``` @@ -97,8 +97,8 @@ src/transformers/models//configuration_.py src/transformers/models//modeling_.py src/transformers/models//modeling_tf_.py src/transformers/models//tokenization_.py -tests/test_modeling_.py -tests/test_modeling_tf_.py +tests/models//test_modeling_.py +tests/models//test_modeling_tf_.py ``` You can run the tests to ensure that they all pass: @@ -107,9 +107,9 @@ You can run the tests to ensure that they all pass: python -m pytest ./tests/test_**.py ``` -Feel free to modify each file to mimic the behavior of your model. +Feel free to modify each file to mimic the behavior of your model. -⚠ You should be careful about the classes preceded by the following line:️ +⚠ You should be careful about the classes preceded by the following line:️ ```python # Copied from transformers.[...] @@ -119,8 +119,8 @@ This line ensures that the copy does not diverge from the source. If it *should* is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`, your changes will be overwritten. -Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change -is needed!) afterwards to make sure everything works as expected. +Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change +is needed!) afterwards to make sure everything works as expected. Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution will be merged quickly: @@ -251,7 +251,7 @@ Once you're done, you can run the tests to ensure that they all pass: python -m pytest ./tests/test_**.py ``` -⚠ You should be careful about the classes preceded by the following line:️ +⚠ You should be careful about the classes preceded by the following line:️ ```python # Copied from transformers.[...] @@ -261,8 +261,8 @@ This line ensures that the copy does not diverge from the source. If it *should* is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`, your changes will be overwritten. -Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change -is needed!) afterwards to make sure everything works as expected. +Once you have edited the files to fit your architecture, simply re-run the tests (and edit them if a change +is needed!) afterwards to make sure everything works as expected. 
Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution will be merged quickly: From 3450fd1bf675bad8960a9d1f2f20d6353091939a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 16:08:25 +0000 Subject: [PATCH 025/131] don't use token type IDs --- tests/models/dbrx/test_modeling_dbrx.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index c5a6c5a9cbde..ff5396ba8397 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -21,16 +21,18 @@ from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask +from ...test_modeling_common import ( + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) if is_torch_available(): import torch - from transformers import ( - DbrxForCausalLM, - DbrxModel, - ) + from transformers import DbrxForCausalLM, DbrxModel class DbrxModelTester: @@ -41,7 +43,7 @@ def __init__( seq_length=7, is_training=True, use_input_mask=True, - use_token_type_ids=True, + use_token_type_ids=False, use_labels=True, vocab_size=99, hidden_size=32, From 46c95470b5d25d0ebd016d712c28dd9f92e28df4 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 16:40:04 +0000 Subject: [PATCH 026/131] follow Llama and remove token_type_ids from test --- tests/models/dbrx/test_modeling_dbrx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index ff5396ba8397..80b398055be3 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -273,6 +273,7 @@ def create_and_check_decoder_model_past_large_inputs( # test that outputs are equal for slice self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Dbrx def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -284,7 +285,7 @@ def prepare_config_and_inputs_for_common(self): token_labels, choice_labels, ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} return config, inputs_dict From 0ed3675314b43c6aa1be31a5be752639360ef0fe Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 17:09:03 +0000 Subject: [PATCH 027/131] init ConfigTester differently so tests pass --- tests/models/dbrx/test_modeling_dbrx.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 80b398055be3..72cdb1363e8f 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -303,7 +303,9 @@ class DbrxModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DbrxModelTester(self) - self.config_tester = ConfigTester(self, config_class=DbrxConfig, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=DbrxConfig, ffn_config={"ffn_hidden_size": 37, "model_type": ""} 
+ ) def test_config(self): self.config_tester.run_common_tests() From a08b27dcf4accfa5eb6f41b4674cf700d48d10ad Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 17:14:30 +0000 Subject: [PATCH 028/131] remove multiple choice test --- tests/models/dbrx/test_modeling_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 72cdb1363e8f..e3eb2362d696 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -324,10 +324,6 @@ def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) From c98d9f22d52adca0716ce91d016abd0dddfac385 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 17:15:01 +0000 Subject: [PATCH 029/131] remove question + answer test --- tests/models/dbrx/test_modeling_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index e3eb2362d696..4c878a632d7e 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -328,10 +328,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) From 598c9a01cb49730f9b7994edb8dd2b9702a17fe6 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 17:15:24 +0000 Subject: [PATCH 030/131] remove sequence classification test --- tests/models/dbrx/test_modeling_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 4c878a632d7e..532a907cf3d9 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -328,10 +328,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) From c73c590961a71ae0e40000b22fc485abbb837ce2 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 17:15:41 +0000 Subject: [PATCH 031/131] remove token 
classification test --- tests/models/dbrx/test_modeling_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 532a907cf3d9..c413e8f0e96b 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -328,10 +328,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) From e58f1b21b987a741cbdc95c3b4ea611527fcdae4 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 18:21:48 +0000 Subject: [PATCH 032/131] copy Llama tests and remove token_type_ids from test inputs --- tests/models/dbrx/test_modeling_dbrx.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index c413e8f0e96b..7094555c5d44 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -28,7 +28,6 @@ random_attention_mask, ) - if is_torch_available(): import torch @@ -150,17 +149,18 @@ def prepare_config_and_inputs_for_decoder(self): encoder_attention_mask, ) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Dbrx def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = DbrxModel(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids, attention_mask=input_mask) result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Dbrx def create_and_check_model_as_decoder( self, config, @@ -180,19 +180,19 @@ def create_and_check_model_as_decoder( result = model( input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, ) result = model( input_ids, attention_mask=input_mask, - token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, ) - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, attention_mask=input_mask) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Dbrx def create_and_check_for_causal_lm( self, config, @@ -208,7 +208,7 @@ def create_and_check_for_causal_lm( model = DbrxForCausalLM(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + result = 
model(input_ids, attention_mask=input_mask, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) def create_and_check_decoder_model_past_large_inputs( From 32ceb8741b5179e2d9129d843902d670995c6679 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 18:52:44 +0000 Subject: [PATCH 033/131] do not test pruning or headmasking; style code --- tests/models/dbrx/test_modeling_dbrx.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 7094555c5d44..14997658c069 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -191,7 +191,6 @@ def create_and_check_model_as_decoder( result = model(input_ids, attention_mask=input_mask) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Dbrx def create_and_check_for_causal_lm( self, @@ -291,15 +290,10 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DbrxModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( - ( - DbrxModel, - DbrxForCausalLM, - ) - if is_torch_available() - else () - ) + all_model_classes = (DbrxModel, DbrxForCausalLM) if is_torch_available() else () all_generative_model_classes = (DbrxForCausalLM,) if is_torch_available() else () + test_headmasking = False + test_pruning = False def setUp(self): self.model_tester = DbrxModelTester(self) From daabaeceefa0e9601d7b1c3a47a803a69d79537d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 19:24:07 +0000 Subject: [PATCH 034/131] add _tied_weights_keys parameter to pass test --- src/transformers/models/dbrx/modeling_dbrx.py | 2 ++ tests/models/dbrx/test_modeling_dbrx.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 155535687e66..8180db32d443 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1216,6 +1216,8 @@ def _update_causal_mask( class DbrxForCausalLM(DbrxPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + def __init__(self, config: DbrxConfig): super().__init__(config) self.transformer = DbrxModel(config) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 14997658c069..60cedac8221d 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -28,6 +28,7 @@ random_attention_mask, ) + if is_torch_available(): import torch From dabcca039344c61010135963f0b640d028d6809e Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 20:38:01 +0000 Subject: [PATCH 035/131] add type hints --- src/transformers/models/dbrx/modeling_dbrx.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 8180db32d443..07e5a256ae42 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -450,7 +450,14 @@ def forward( # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( - self, query_states, key_states, value_states, 
attention_mask, query_length, dropout=0.0, softmax_scale=None + self, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, + query_length: int, + dropout: float = 0.0, + softmax_scale: optional[float] = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -466,6 +473,7 @@ def _flash_attention_forward( attention_mask (`torch.Tensor`): The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position of padding tokens and 1 for the position of non-padding tokens. + query_length (`int`): The length of the query sequence. dropout (`float`): Attention dropout softmax_scale (`float`, *optional*): @@ -509,7 +517,14 @@ def _flash_attention_forward( return attn_output # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + def _upad_input( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + attention_mask: torch.Tensor, + query_length: int + ): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape From 191ec1e0e50c41662bee3a950f7e506dc3bdc5ee Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 20:43:30 +0000 Subject: [PATCH 036/131] fix type check --- src/transformers/models/dbrx/modeling_dbrx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 07e5a256ae42..c85ad168289c 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -457,7 +457,7 @@ def _flash_attention_forward( attention_mask: torch.Tensor, query_length: int, dropout: float = 0.0, - softmax_scale: optional[float] = None, + softmax_scale: Optional[float] = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -523,8 +523,8 @@ def _upad_input( key_layer: torch.Tensor, value_layer: torch.Tensor, attention_mask: torch.Tensor, - query_length: int - ): + query_length: int, + ): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape From 3dad3bddf09ec302d635fe9c2ae996893c3bc55b Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 20:43:46 +0000 Subject: [PATCH 037/131] update config tester --- tests/models/dbrx/test_modeling_dbrx.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 60cedac8221d..f40406737548 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -298,9 +298,7 @@ class DbrxModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DbrxModelTester(self) - self.config_tester = ConfigTester( - self, config_class=DbrxConfig, ffn_config={"ffn_hidden_size": 37, "model_type": ""} - ) + self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37) def test_config(self): self.config_tester.run_common_tests() From 5c837c94684d06bb995089348e0d4800f2bfaabe Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 20:47:44 +0000 
Subject: [PATCH 038/131] remove masked_lm test --- tests/models/dbrx/test_modeling_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index f40406737548..d91aca6e7d51 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -313,10 +313,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) From 58a4f15f8e37e3920815a0f96d107a5d5aff7127 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Sun, 31 Mar 2024 22:16:57 +0000 Subject: [PATCH 039/131] remove encoder tests --- tests/models/dbrx/test_modeling_dbrx.py | 70 +------------------------ 1 file changed, 1 insertion(+), 69 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index d91aca6e7d51..15fe9d63456d 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -21,12 +21,7 @@ from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ( - ModelTesterMixin, - floats_tensor, - ids_tensor, - random_attention_mask, -) +from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask if is_torch_available(): @@ -123,33 +118,6 @@ def get_config(self): initializer_range=self.initializer_range, ) - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Dbrx def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -313,42 +281,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, 
- encoder_hidden_states, - encoder_attention_mask, - ) = self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - @slow def test_model_from_pretrained(self): model_name = "databricks/dbrx-instruct" From 60662cb79469cf3b6d67bdf54c36fccf3d729e01 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 01:15:13 +0000 Subject: [PATCH 040/131] initialize DbrxModelTester with correct params --- tests/models/dbrx/test_modeling_dbrx.py | 126 +++++++++++++++++------- 1 file changed, 92 insertions(+), 34 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 15fe9d63456d..26569c01f7ff 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -23,7 +23,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - if is_torch_available(): import torch @@ -34,50 +33,102 @@ class DbrxModelTester: def __init__( self, parent, + hidden_size=32, + ffn_hidden_size=32, + num_attention_heads=4, + kv_n_heads=4, + num_hidden_layers=5, + max_position_embeddings=512, + type_vocab_size=16, batch_size=13, seq_length=7, is_training=True, use_input_mask=True, use_token_type_ids=False, use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, + use_cache=True, type_sequence_label_size=2, - initializer_range=0.02, num_labels=3, num_choices=4, scope=None, + clip_qkv=8, + rope_theta=500000, + attn_config_model_type="", + emb_pdrop=0.0, + moe_jitter_eps=0, + moe_loss_weight=0.05, + moe_num_experts=16, + moe_top_k=4, + fnn_config_model_type="", + ffn_act_fn_name="gelu", + initializer_range=0.02, + output_router_logits=False, + resid_pdrop=0.0, + router_aux_loss_coef=0.05, + tie_word_embeddings=False, + torch_dtype=torch.bfloat16, + vocab_size=99, ): - self.parent = parent + # Parameters unique to testing self.batch_size = batch_size self.seq_length = seq_length + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + self.parent = parent self.is_training = is_training self.use_input_mask = use_input_mask self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels - self.vocab_size = vocab_size + + # attn_config params + self.clip_qkv = clip_qkv + self.kv_n_heads = kv_n_heads + self.rope_theta = rope_theta + self.attn_config_model_type = attn_config_model_type + + # fnn_config params + self.ffn_hidden_size = ffn_hidden_size + self.moe_jitter_eps = moe_jitter_eps + self.moe_loss_weight = moe_loss_weight + self.moe_num_experts = moe_num_experts + self.moe_top_k = moe_top_k + self.fnn_config_model_type = fnn_config_model_type + self.ffn_act_fn_name = ffn_act_fn_name + + # Other params self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - 
self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size + self.vocab_size = vocab_size + self.use_cache = use_cache self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope + self.emb_pdrop = emb_pdrop + self.output_router_logits = output_router_logits + self.resid_pdrop = resid_pdrop + self.router_aux_loss_coef = router_aux_loss_coef + self.tie_word_embeddings = tie_word_embeddings + self.torch_dtype = torch_dtype + + # Make the dictionaries + self.ffn_config = { + "ffn_hidden_size": self.ffn_hidden_size, + "moe_jitter_eps": self.moe_jitter_eps, + "moe_loss_weight": self.moe_loss_weight, + "moe_num_experts": self.moe_num_experts, + "moe_top_k": self.moe_top_k, + "model_type": self.fnn_config_model_type, + "ffn_act_fn": {"name": self.ffn_act_fn_name}, + } + self.attn_config = { + "clip_qkv": self.clip_qkv, + "kv_n_heads": self.kv_n_heads, + "model_type": self.attn_config_model_type, + "rope_theta": self.rope_theta, + } def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -103,20 +154,27 @@ def prepare_config_and_inputs(self): return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def get_config(self): - return DbrxConfig( + # Behind the scenes, `DbrxConfig` maps the parameters `hidden_size`, `num_hidden_layers`, + # `num_attention_heads`, `max_position_embeddings` to the parameters `d_model`, `n_layers`, + # `n_heads`, `max_seq_len` respectively. We use the first group of parameters because + # other tests expect every model to have these parameters with these specific names. 
+ config = DbrxConfig( vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - is_decoder=False, + hidden_size=self.hidden_size, # mapped to `d_model` + num_hidden_layers=self.num_hidden_layers, # mapped to `n_layers` + num_attention_heads=self.num_attention_heads, # mapped to `n_heads` + max_position_embeddings=self.max_position_embeddings, # mapped to `max_seq_len` + attn_config=self.attn_config, + ffn_config=self.ffn_config, + resid_pdrop=self.resid_pdrop, + emb_pdrop=self.emb_pdrop, + use_cache=self.use_cache, initializer_range=self.initializer_range, + output_router_logits=self.output_router_logits, + router_aux_loss_coef=self.router_aux_loss_coef, + is_decoder=False, ) + return config # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Dbrx def create_and_check_model( From e82992265d41bfbdecf35fdb21380e15a7ee42bf Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 01:15:46 +0000 Subject: [PATCH 041/131] style --- tests/models/dbrx/test_modeling_dbrx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 26569c01f7ff..ff6524ba6699 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -23,6 +23,7 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + if is_torch_available(): import torch From 7cca86a1a4f6553dbf8315d7b8f20ea3a4de5834 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 01:59:33 +0000 Subject: [PATCH 042/131] torch_dtype does not rely on torch --- tests/models/dbrx/test_modeling_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index ff6524ba6699..266c94d9bff2 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -67,7 +67,7 @@ def __init__( resid_pdrop=0.0, router_aux_loss_coef=0.05, tie_word_embeddings=False, - torch_dtype=torch.bfloat16, + torch_dtype="bfloat16", vocab_size=99, ): # Parameters unique to testing From 1e21729bd3c056f017b4b1700948aaf543267b6b Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 02:39:51 +0000 Subject: [PATCH 043/131] run make fixup, fix-copies --- src/transformers/models/dbrx/modeling_dbrx.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index c85ad168289c..0b3824ab7a96 100755 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -450,14 +450,7 @@ def forward( # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - query_length: int, - dropout: float = 0.0, - softmax_scale: Optional[float] = None, + self, 
query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -473,7 +466,6 @@ def _flash_attention_forward( attention_mask (`torch.Tensor`): The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position of padding tokens and 1 for the position of non-padding tokens. - query_length (`int`): The length of the query sequence. dropout (`float`): Attention dropout softmax_scale (`float`, *optional*): @@ -517,14 +509,7 @@ def _flash_attention_forward( return attn_output # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input( - self, - query_layer: torch.Tensor, - key_layer: torch.Tensor, - value_layer: torch.Tensor, - attention_mask: torch.Tensor, - query_length: int, - ): + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape @@ -1113,6 +1098,17 @@ def forward( use_cache=use_cache, cache_position=cache_position, ) + # block_outputs = self._gradient_checkpointing_func( + # block.__call__, + # hidden_states, + # causal_mask, + # position_ids, + # past_key_values, + # output_attentions, + # output_router_logits, + # use_cache, + # cache_position, + # ) else: block_outputs = block( hidden_states, From 7e4b7fd3b6d9964b58e30aad365768071f683d9b Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:21:56 +0000 Subject: [PATCH 044/131] use https://huggingface.co/v2ray/dbrx-base-fixed/blob/main/modeling_dbrx.py --- src/transformers/models/dbrx/modeling_dbrx.py | 453 ++++++++---------- 1 file changed, 190 insertions(+), 263 deletions(-) mode change 100755 => 100644 src/transformers/models/dbrx/modeling_dbrx.py diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py old mode 100755 new mode 100644 index 0b3824ab7a96..f7711c0e6c40 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1,71 +1,65 @@ -# coding=utf-8 -# Copyright 2022 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch DBRX model. 
""" +"""PyTorch Dbrx model.""" import math import warnings -from typing import Any, Dict, Optional, Tuple, Union +from copy import deepcopy +from functools import partial +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch.nn.functional as F import torch.utils.checkpoint from torch import nn -from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache -from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_utils import PreTrainedModel -from ...utils import ( - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, ) -from .configuration_dbrx import DbrxConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import is_flash_attn_2_available, logging + +from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) + except ImportError: + pass logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "DbrxConfig" +############################################################################# +# Copied from LLaMaRotaryEmbedding +############################################################################# + -# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->Dbrx class DbrxRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + def __init__( + self, dim: int, max_position_embeddings: int = 2048, base: float = 10000.0, scaling_factor: float = 1.0 + ): super().__init__() - + self.scaling_factor = scaling_factor self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - self.register_buffer("inv_freq", None, persistent=False) + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + # For BC we register cos and sin cached + self.max_seq_len_cached = max_position_embeddings @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): + def forward(self, x: torch.Tensor, position_ids: torch.LongTensor) -> Tuple[torch.Tensor, torch.Tensor]: # x: [bs, num_attention_heads, seq_len, head_size] - if self.inv_freq is None: - self.inv_freq = 1.0 / ( - self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) - ) inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts @@ -80,16 +74,16 @@ def forward(self, x, position_ids, seq_len=None): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): +def rotate_half(x: torch.Tensor) -> torch.Tensor: 
"""Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): +def apply_rotary_pos_emb( + q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, unsqueeze_dim: int = 1 +) -> Tuple[torch.Tensor, torch.Tensor]: """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -97,15 +91,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos and + sin so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos and sin have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ @@ -116,11 +109,11 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): return q_embed, k_embed -# Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
+ + The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: @@ -129,6 +122,13 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) +############################################################################# + +############################################################################# +# Modified from modeling_mixtral +############################################################################# + + def load_balancing_loss_func( gate_logits: torch.Tensor, num_experts: int, @@ -209,8 +209,34 @@ def load_balancing_loss_func( return overall_loss * num_experts -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): +############################################################################# + + +def resolve_ffn_act_fn(ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]: + """Resolve the activation function for the feed-forward network. + + Args: + ffn_act_fn (dict): The configuration dictionary for the activation function. + The dict config must specify the 'name' of a torch.nn.functional activation + function. All of other key values pairs are bound to the function as a partial. + + Returns: + Callable[[torch.Tensor], torch.Tensor]: The activation function. + """ + config = deepcopy(ffn_act_fn) + name = config.pop("name") + if not hasattr(nn.functional, name): + raise ValueError(f"Unrecognised activation function name ({name}).") + act = getattr(nn.functional, name) + return partial(act, **config) + + +############################################################################# +# Copied from LLaMaAttention +############################################################################# + + +def _get_unpad_data(attention_mask: torch.Tensor): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() @@ -227,16 +253,19 @@ class DbrxAttention(nn.Module): def __init__( self, - config: DbrxConfig, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + attn_config: DbrxAttentionConfig, block_idx: Optional[int] = None, ): super().__init__() - self.config = config - self.hidden_size = config.d_model - self.num_heads = config.n_heads + self.hidden_size = hidden_size + self.num_heads = num_heads self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_seq_len + self.max_position_embeddings = max_position_embeddings self.block_idx = block_idx + self.config = attn_config if block_idx is None: logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will " @@ -244,13 +273,11 @@ def __init__( + "when creating this class." 
) - attn_config = config.attn_config self.attn_pdrop = attn_config.attn_pdrop self.clip_qkv = attn_config.clip_qkv self.num_key_value_heads = attn_config.kv_n_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.rope_theta = attn_config.rope_theta - self.is_casual = True self.Wqkv = nn.Linear( self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=False @@ -345,12 +372,6 @@ def __init__(self, *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - # From: https://github.com/huggingface/transformers/blob/3b8e2932ce743008f63585aae1e1b8b30dc8b3ac/src/transformers/models/gemma/modeling_gemma.py#L318 - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - def forward( self, hidden_states: torch.Tensor, @@ -448,35 +469,31 @@ def forward( return attn_output, attn_weights, past_key_value # type: ignore - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + self, + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Union[torch.LongTensor, None], + query_length: int, + dropout: float = 0.0, + softmax_scale: Optional[float] = None, ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. + """Use FlashAttention, stripping padding tokens if necessary. Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`float`): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + query_states (torch.Tensor): Input query states to be passed to Flash Attention API + key_states (torch.Tensor): Input key states to be passed to Flash Attention API + value_states (torch.Tensor): Input value states to be passed to Flash Attention API + attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size + (batch_size, seq_len) where 0 stands for the position of padding tokens and 1 + for the position of non-padding tokens. + query_length (int): The length of the query sequence + dropout (float): Attention dropout + softmax_scale (float, optional): The scaling of QK^T before applying softmax. 
+ Defaults to 1 / sqrt(head_dim) """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - + causal = True # Contains at least one padding token in the sequence if attention_mask is not None: batch_size = query_states.shape[0] @@ -500,16 +517,32 @@ def _flash_attention_forward( causal=causal, ) - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + attn_output = pad_input( + attn_output_unpad, + indices_q, + batch_size, + query_length, + ) else: attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, ) return attn_output - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + def _upad_input( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + attention_mask: torch.Tensor, + query_length: int, + ): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape @@ -548,121 +581,35 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query ) -class DbrxSdpaAttention(DbrxAttention): - """ - Dbrx attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `DbrxAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Ignore copy - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "GemmaModel is using GemmaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.Wqkv(hidden_states) - if self.clip_qkv is not None: - qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) - - query_states, key_states, value_states = qkv_states.split( - [ - self.hidden_size, - self.num_key_value_heads * self.head_dim, - self.num_key_value_heads * self.head_dim, - ], - dim=2, - ) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None) - - past_key_value = getattr(self, "past_key_value", past_key_value) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. 
- if query_states.device.type == "cuda" and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attn_pdrop if self.training else 0.0, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, -1) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - DBRX_ATTENTION_CLASSES = { "eager": DbrxAttention, "flash_attention_2": DbrxFlashAttention2, - "sdpa": DbrxSdpaAttention, } class DbrxNormAttentionNorm(nn.Module): def __init__( self, - config: DbrxConfig, + hidden_size: int, + num_heads: int, + max_position_embeddings: int, + resid_pdrop: float, + attn_implementation: str, + attn_config: DbrxAttentionConfig, block_idx: Optional[int] = None, ): super().__init__() self.block_idx = block_idx - self.resid_pdrop = config.resid_pdrop - self.norm_1 = nn.LayerNorm(config.d_model, bias=False) - self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( - config=config, + self.resid_pdrop = resid_pdrop + self.norm_1 = nn.LayerNorm(hidden_size, bias=False) + self.attn = DBRX_ATTENTION_CLASSES[attn_implementation]( + hidden_size=hidden_size, + num_heads=num_heads, + max_position_embeddings=max_position_embeddings, + attn_config=attn_config, block_idx=block_idx, ) - self.norm_2 = nn.LayerNorm(config.d_model, bias=False) + self.norm_2 = nn.LayerNorm(hidden_size, bias=False) def forward( self, @@ -718,9 +665,17 @@ def __init__( self.layer = nn.Linear(self.hidden_size, self.moe_num_experts, bias=False) + def jitter(self, x: torch.Tensor) -> torch.Tensor: + if self.moe_jitter_eps is None: + raise RuntimeError("The router does not have moe_jitter_eps set.") + low = 1.0 - self.moe_jitter_eps + high = 1.0 + self.moe_jitter_eps + noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) + return low + noise * (high - low) + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: if self.training and self.moe_jitter_eps is not None: - x *= torch.empty_like(x).uniform_(1.0 - self.moe_jitter_eps, 1.0 + self.moe_jitter_eps) + x = x * self.jitter(x) weights = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) @@ -745,43 +700,28 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Lo class DbrxExpertGLU(nn.Module): - def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): + def __init__(self, hidden_size: int, ffn_hidden_size: int, ffn_act_fn: dict): super().__init__() - self.hidden_size = hidden_size - self.ffn_hidden_size = ffn_hidden_size - self.moe_num_experts = moe_num_experts - - self.w1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) - self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.v1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) + self.w2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False) + self.activation_fn = resolve_ffn_act_fn(ffn_act_fn) - act_fn_name = ffn_act_fn.pop("name", "silu") - if len(ffn_act_fn) != 0: - raise ValueError(f"FFN 
activation function has unhandled kwargs {ffn_act_fn=}") - self.activation_fn = ACT2FN[act_fn_name] - - def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: - expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] - expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] - expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] - - gate_proj = x.matmul(expert_w1.t()) - up_proj = x.matmul(expert_v1.t()) - gate_proj = self.activation_fn(gate_proj) - intermediate_states = gate_proj * up_proj - down_proj = intermediate_states.matmul(expert_w2) - return down_proj + def forward(self, x: torch.Tensor) -> torch.Tensor: + x1 = self.w1(x) + x2 = self.v1(x) + x1 = self.activation_fn(x1) + x1 = x1 * x2 + x1 = self.w2(x1) + return x1 class DbrxExperts(nn.Module): def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): super().__init__() self.moe_num_experts = moe_num_experts - self.mlp = DbrxExpertGLU( - hidden_size=hidden_size, - ffn_hidden_size=ffn_hidden_size, - moe_num_experts=moe_num_experts, - ffn_act_fn=ffn_act_fn, + self.mlp_experts = nn.ModuleList( + [DbrxExpertGLU(hidden_size, ffn_hidden_size, ffn_act_fn) for _ in range(moe_num_experts)] ) def forward( @@ -801,7 +741,7 @@ def forward( topk_list = topk_idx.tolist() expert_tokens = x[None, token_list].reshape(-1, hidden_size) - expert_out = self.mlp(expert_tokens, expert_idx) * top_weights[token_list, topk_list, None] + expert_out = self.mlp_experts[expert_idx](expert_tokens) * top_weights[token_list, topk_list, None] out.index_add_(0, token_idx, expert_out) @@ -810,12 +750,11 @@ def forward( class DbrxFFN(nn.Module): - def __init__(self, config: DbrxConfig): + def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): super().__init__() - ffn_config = config.ffn_config self.router = DbrxRouter( - hidden_size=config.d_model, + hidden_size, moe_num_experts=ffn_config.moe_num_experts, moe_top_k=ffn_config.moe_top_k, moe_jitter_eps=ffn_config.moe_jitter_eps, @@ -824,7 +763,7 @@ def __init__(self, config: DbrxConfig): ) self.experts = DbrxExperts( - hidden_size=config.d_model, + hidden_size=hidden_size, ffn_hidden_size=ffn_config.ffn_hidden_size, moe_num_experts=ffn_config.moe_num_experts, ffn_act_fn=ffn_config.ffn_act_fn, @@ -843,16 +782,21 @@ def __init__(self, config: DbrxConfig, block_idx: int): self.resid_pdrop = config.resid_pdrop self.block_idx = block_idx self.norm_attn_norm = DbrxNormAttentionNorm( - config=config, + hidden_size=config.d_model, + num_heads=config.n_heads, + max_position_embeddings=config.max_seq_len, + resid_pdrop=config.resid_pdrop, + attn_implementation=config._attn_implementation, + attn_config=config.attn_config, block_idx=block_idx, ) - self.ffn = DbrxFFN(config=config) + self.ffn = DbrxFFN(hidden_size=config.d_model, ffn_config=config.ffn_config) def forward( self, hidden_states: torch.Tensor, - position_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, + position_ids: torch.LongTensor = None, past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, output_router_logits: Optional[bool] = False, @@ -944,10 +888,6 @@ def _init_weights(self, module: nn.Module): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, DbrxExpertGLU): - module.w1.data.normal_(mean=0.0, std=std) - module.v1.data.normal_(mean=0.0, 
std=std) - module.w2.data.normal_(mean=0.0, std=std) def _setup_cache( self, cache_cls: Any, max_batch_size: int, max_cache_len: int @@ -980,9 +920,7 @@ class DbrxModel(DbrxPreTrainedModel): [`DbrxBlock`] layers. Args: - config ([`DbrxConfig`]): Model configuration class with all parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + config: DbrxConfig """ def __init__(self, config: DbrxConfig): @@ -1090,25 +1028,14 @@ def forward( block_outputs = self._gradient_checkpointing_func( block.__call__, hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - output_router_logits=output_router_logits, - use_cache=use_cache, - cache_position=cache_position, + causal_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + cache_position, ) - # block_outputs = self._gradient_checkpointing_func( - # block.__call__, - # hidden_states, - # causal_mask, - # position_ids, - # past_key_values, - # output_attentions, - # output_router_logits, - # use_cache, - # cache_position, - # ) else: block_outputs = block( hidden_states, @@ -1280,8 +1207,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, DbrxForCausalLM - >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") - >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct") + >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx") + >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx") >>> prompt = "Hey, are you conscious? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") From 4d9da541760b886dbe0fad89051abfdba96748b8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:26:34 +0000 Subject: [PATCH 045/131] add copyright info --- src/transformers/models/dbrx/modeling_dbrx.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index f7711c0e6c40..097debcd192a 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1,4 +1,18 @@ -"""PyTorch Dbrx model.""" +# coding=utf-8 +# Copyright 2022 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DBRX model. 
""" import math import warnings @@ -22,15 +36,11 @@ from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig - if is_flash_attn_2_available(): try: from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input except ImportError: pass From 9b8f9127d3c4260d8917407f34f2527f0e6e4b33 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:32:49 +0000 Subject: [PATCH 046/131] fix imports and DbrxRotaryEmbedding --- src/transformers/models/dbrx/modeling_dbrx.py | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 097debcd192a..8616a6a90ab6 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -25,51 +25,53 @@ import torch.utils.checkpoint from torch import nn -from transformers.cache_utils import Cache, DynamicCache, StaticCache -from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_outputs import ( - MoeCausalLMOutputWithPast, - MoeModelOutputWithPast, -) +# from transformers.cache_utils import Cache, DynamicCache, StaticCache +# from transformers.modeling_attn_mask_utils import AttentionMaskConverter +# from transformers.modeling_outputs import ( +# MoeCausalLMOutputWithPast, +# MoeModelOutputWithPast, +# ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available, logging +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...utils import ( + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, +) from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig if is_flash_attn_2_available(): - try: - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input - except ImportError: - pass + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "DbrxConfig" -############################################################################# -# Copied from LLaMaRotaryEmbedding -############################################################################# - - +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->Dbrx class DbrxRotaryEmbedding(nn.Module): - def __init__( - self, dim: int, max_position_embeddings: int = 2048, base: float = 10000.0, scaling_factor: float = 1.0 - ): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() - self.scaling_factor = scaling_factor + self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) - self.register_buffer("inv_freq", 
inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.register_buffer("inv_freq", None, persistent=False) @torch.no_grad() - def forward(self, x: torch.Tensor, position_ids: torch.LongTensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) + ) inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts From 3692a90252842e8298329fdd1819c0d491fbc2af Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:34:43 +0000 Subject: [PATCH 047/131] update DbrxModel docstring --- src/transformers/models/dbrx/modeling_dbrx.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 8616a6a90ab6..168237dda389 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -932,7 +932,9 @@ class DbrxModel(DbrxPreTrainedModel): [`DbrxBlock`] layers. Args: - config: DbrxConfig + config ([`DbrxConfig`]): Model configuration class with all parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ def __init__(self, config: DbrxConfig): From f050499310b4357f6f248588c31824a1d562b231 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:46:04 +0000 Subject: [PATCH 048/131] use copies --- src/transformers/models/dbrx/modeling_dbrx.py | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 168237dda389..f93f36290dd7 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -85,17 +85,15 @@ def forward(self, x, position_ids, seq_len=None): sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - -def rotate_half(x: torch.Tensor) -> torch.Tensor: +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) - -def apply_rotary_pos_emb( - q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, unsqueeze_dim: int = 1 -) -> Tuple[torch.Tensor, torch.Tensor]: +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -103,14 +101,15 @@ def apply_rotary_pos_emb( k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos and - sin so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos and sin have the shape [batch_size, seq_len, head_dim]. Then, if q and + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos and sin broadcastable to the shapes of q and k. Similarly, if q and k have + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ @@ -120,12 +119,11 @@ def apply_rotary_pos_emb( k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed - +# Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). - - The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: @@ -134,13 +132,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -############################################################################# - -############################################################################# -# Modified from modeling_mixtral -############################################################################# - - def load_balancing_loss_func( gate_logits: torch.Tensor, num_experts: int, @@ -247,8 +238,8 @@ def resolve_ffn_act_fn(ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tenso # Copied from LLaMaAttention ############################################################################# - -def _get_unpad_data(attention_mask: torch.Tensor): +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() From a075df2cc4e8aab67174ce2c67c8d3b38d8879e8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 03:50:39 +0000 Subject: [PATCH 049/131] change model path in docstring --- src/transformers/models/dbrx/modeling_dbrx.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index f93f36290dd7..0e3e7e70fa02 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -34,22 +34,24 @@ from 
transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available, logging -from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel from ...utils import ( is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, logging, ) from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) @@ -1212,8 +1214,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, DbrxForCausalLM - >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx") - >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx") + >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct") >>> prompt = "Hey, are you conscious? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") From 1dc307386681f88a25cf040487ac0c9e6eb3f809 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 04:17:40 +0000 Subject: [PATCH 050/131] use config in DbrxFFN --- src/transformers/models/dbrx/modeling_dbrx.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 0e3e7e70fa02..47be72e54e73 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -38,20 +38,13 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel -from ...utils import ( - is_flash_attn_2_available, - logging, -) -from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig, DbrxFFNConfig - +from ...utils import is_flash_attn_2_available, logging +from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -755,11 +748,12 @@ def forward( class DbrxFFN(nn.Module): - def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): + def __init__(self, config: DbrxConfig): super().__init__() + ffn_config = config.ffn_config self.router = DbrxRouter( - hidden_size, + hidden_size=config.d_model, moe_num_experts=ffn_config.moe_num_experts, moe_top_k=ffn_config.moe_top_k, moe_jitter_eps=ffn_config.moe_jitter_eps, @@ -768,7 +762,7 @@ def __init__(self, hidden_size: int, ffn_config: DbrxFFNConfig): ) self.experts = DbrxExperts( - hidden_size=hidden_size, + hidden_size=config.d_model, ffn_hidden_size=ffn_config.ffn_hidden_size, moe_num_experts=ffn_config.moe_num_experts, ffn_act_fn=ffn_config.ffn_act_fn, @@ -795,7 +789,7 @@ def 
__init__(self, config: DbrxConfig, block_idx: int): attn_config=config.attn_config, block_idx=block_idx, ) - self.ffn = DbrxFFN(hidden_size=config.d_model, ffn_config=config.ffn_config) + self.ffn = DbrxFFN(config=config) def forward( self, From 7df83697baa10c9cb2bca950142a8ed11c8846ad Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 04:36:55 +0000 Subject: [PATCH 051/131] fix flashattention2, sdpaattention --- src/transformers/models/dbrx/modeling_dbrx.py | 173 +++++++++++++----- 1 file changed, 131 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 47be72e54e73..9ad070ab5e3a 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -41,10 +41,14 @@ from ...utils import is_flash_attn_2_available, logging from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) @@ -271,6 +275,7 @@ def __init__( + "when creating this class." ) + self.is_casual = True self.attn_pdrop = attn_config.attn_pdrop self.clip_qkv = attn_config.clip_qkv self.num_key_value_heads = attn_config.kv_n_heads @@ -467,31 +472,35 @@ def forward( return attn_output, attn_weights, past_key_value # type: ignore + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward def _flash_attention_forward( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: Union[torch.LongTensor, None], - query_length: int, - dropout: float = 0.0, - softmax_scale: Optional[float] = None, + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None ): - """Use FlashAttention, stripping padding tokens if necessary. + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. Args: - query_states (torch.Tensor): Input query states to be passed to Flash Attention API - key_states (torch.Tensor): Input key states to be passed to Flash Attention API - value_states (torch.Tensor): Input value states to be passed to Flash Attention API - attention_mask (torch.LongTensor | None): The padding mask - corresponds to a tensor of size - (batch_size, seq_len) where 0 stands for the position of padding tokens and 1 - for the position of non-padding tokens. - query_length (int): The length of the query sequence - dropout (float): Attention dropout - softmax_scale (float, optional): The scaling of QK^T before applying softmax. - Defaults to 1 / sqrt(head_dim) + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. 
+ dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) """ - causal = True + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + # Contains at least one padding token in the sequence if attention_mask is not None: batch_size = query_states.shape[0] @@ -515,32 +524,16 @@ def _flash_attention_forward( causal=causal, ) - attn_output = pad_input( - attn_output_unpad, - indices_q, - batch_size, - query_length, - ) + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) else: attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal ) return attn_output - def _upad_input( - self, - query_layer: torch.Tensor, - key_layer: torch.Tensor, - value_layer: torch.Tensor, - attention_mask: torch.Tensor, - query_length: int, - ): + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape @@ -579,9 +572,105 @@ def _upad_input( ) +class DbrxSdpaAttention(DbrxAttention): + """ + Dbrx attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `DbrxAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "GemmaModel is using GemmaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.Wqkv(hidden_states) + if self.clip_qkv is not None: + qkv_states = qkv_states.clamp(min=-self.clip_qkv, max=self.clip_qkv) + + query_states, key_states, value_states = qkv_states.split( + [ + self.hidden_size, + self.num_key_value_heads * self.head_dim, + self.num_key_value_heads * self.head_dim, + ], + dim=2, + ) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
+ if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attn_pdrop if self.training else 0.0, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + DBRX_ATTENTION_CLASSES = { "eager": DbrxAttention, "flash_attention_2": DbrxFlashAttention2, + "sdpa": DbrxSdpaAttention, } From aa8c55d1fd79d197d2fe9df254d56367f67dd9d8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 04:44:29 +0000 Subject: [PATCH 052/131] input config to DbrXAttention, DbrxNormAttentionNorm --- src/transformers/models/dbrx/modeling_dbrx.py | 39 +++++++------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 9ad070ab5e3a..143754f27dcc 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -25,12 +25,6 @@ import torch.utils.checkpoint from torch import nn -# from transformers.cache_utils import Cache, DynamicCache, StaticCache -# from transformers.modeling_attn_mask_utils import AttentionMaskConverter -# from transformers.modeling_outputs import ( -# MoeCausalLMOutputWithPast, -# MoeModelOutputWithPast, -# ) from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available, logging @@ -41,14 +35,10 @@ from ...utils import is_flash_attn_2_available, logging from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig - if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -255,19 +245,16 @@ class DbrxAttention(nn.Module): def __init__( self, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - attn_config: DbrxAttentionConfig, + config: DbrxConfig, block_idx: Optional[int] = None, ): super().__init__() - self.hidden_size = hidden_size - self.num_heads = num_heads + self.config = config + self.hidden_size = config.d_model + self.num_heads = config.n_heads self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = max_position_embeddings + self.max_position_embeddings = config.max_seq_len self.block_idx = block_idx - self.config = attn_config if block_idx is None: logger.warning_once( f"Instantiating {self.__class__.__name__} without passing a `block_idx` is not recommended and will " @@ -275,12 +262,13 @@ def __init__( + "when creating this class." 
) - self.is_casual = True + attn_config = config.attn_config self.attn_pdrop = attn_config.attn_pdrop self.clip_qkv = attn_config.clip_qkv self.num_key_value_heads = attn_config.kv_n_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.rope_theta = attn_config.rope_theta + self.is_casual = True self.Wqkv = nn.Linear( self.hidden_size, self.hidden_size + 2 * self.num_key_value_heads * self.head_dim, bias=False @@ -677,6 +665,7 @@ def forward( class DbrxNormAttentionNorm(nn.Module): def __init__( self, + config: DbrxConfig, hidden_size: int, num_heads: int, max_position_embeddings: int, @@ -689,11 +678,8 @@ def __init__( self.block_idx = block_idx self.resid_pdrop = resid_pdrop self.norm_1 = nn.LayerNorm(hidden_size, bias=False) - self.attn = DBRX_ATTENTION_CLASSES[attn_implementation]( - hidden_size=hidden_size, - num_heads=num_heads, - max_position_embeddings=max_position_embeddings, - attn_config=attn_config, + self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( + config=config, block_idx=block_idx, ) self.norm_2 = nn.LayerNorm(hidden_size, bias=False) @@ -870,6 +856,7 @@ def __init__(self, config: DbrxConfig, block_idx: int): self.resid_pdrop = config.resid_pdrop self.block_idx = block_idx self.norm_attn_norm = DbrxNormAttentionNorm( + config=config, hidden_size=config.d_model, num_heads=config.n_heads, max_position_embeddings=config.max_seq_len, From 4b01cdc844e53370daaed7307548835aad3915c8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:02:28 +0000 Subject: [PATCH 053/131] more fixes --- src/transformers/models/dbrx/modeling_dbrx.py | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 143754f27dcc..7fc70c123a8a 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -25,25 +25,31 @@ import torch.utils.checkpoint from torch import nn -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import is_flash_attn_2_available, logging - from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel -from ...utils import is_flash_attn_2_available, logging -from .configuration_dbrx import DbrxAttentionConfig, DbrxConfig +from ...utils import ( + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, +) +from .configuration_dbrx import DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "DbrxConfig" + # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->Dbrx class DbrxRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): @@ -74,6 +80,7 @@ def forward(self, x, position_ids, seq_len=None): sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + # Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" @@ -81,6 
+88,7 @@ def rotate_half(x): x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) + # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -108,6 +116,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ @@ -227,6 +236,7 @@ def resolve_ffn_act_fn(ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tenso # Copied from LLaMaAttention ############################################################################# + # Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) @@ -363,6 +373,12 @@ def __init__(self, *args: Any, **kwargs: Any): super().__init__(*args, **kwargs) + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # This module inherits from `DbrxAttention` as the weights of the module stays + # From: https://github.com/huggingface/transformers/blob/3b8e2932ce743008f63585aae1e1b8b30dc8b3ac/src/transformers/models/gemma/modeling_gemma.py#L318 + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + def forward( self, hidden_states: torch.Tensor, @@ -578,7 +594,6 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
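Stepping outside the diff for a moment — the hunks above fold the per-layer attention arguments into `config` and keep the fused `Wqkv` projection. A minimal, self-contained sketch (not part of the patch) of how such a fused projection is split for grouped-query attention and how `repeat_kv` broadcasts the key/value heads; the sizes are made up, and only the shape logic mirrors the code above:

```python
# Illustrative sketch only: fused Wqkv split + grouped-query-attention head repetition.
import torch
from torch import nn

hidden_size, n_heads, kv_n_heads, bsz, seq_len = 256, 8, 2, 1, 4
head_dim = hidden_size // n_heads
clip_qkv = 8.0  # assumption for illustration; may be None

Wqkv = nn.Linear(hidden_size, hidden_size + 2 * kv_n_heads * head_dim, bias=False)
hidden_states = torch.randn(bsz, seq_len, hidden_size)

qkv = Wqkv(hidden_states)
if clip_qkv is not None:
    qkv = qkv.clamp(min=-clip_qkv, max=clip_qkv)

# Split the fused projection into query/key/value and move heads to dim 1.
query, key, value = qkv.split(
    [hidden_size, kv_n_heads * head_dim, kv_n_heads * head_dim], dim=2
)
query = query.view(bsz, seq_len, n_heads, head_dim).transpose(1, 2)
key = key.view(bsz, seq_len, kv_n_heads, head_dim).transpose(1, 2)
value = value.view(bsz, seq_len, kv_n_heads, head_dim).transpose(1, 2)

# repeat_kv: expand each kv head n_rep times so K/V match the query head count.
n_rep = n_heads // kv_n_heads
key = key[:, :, None, :, :].expand(bsz, kv_n_heads, n_rep, seq_len, head_dim)
key = key.reshape(bsz, n_heads, seq_len, head_dim)

print(query.shape, key.shape, value.shape)
```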
logger.warning_once( @@ -666,23 +681,17 @@ class DbrxNormAttentionNorm(nn.Module): def __init__( self, config: DbrxConfig, - hidden_size: int, - num_heads: int, - max_position_embeddings: int, - resid_pdrop: float, - attn_implementation: str, - attn_config: DbrxAttentionConfig, block_idx: Optional[int] = None, ): super().__init__() self.block_idx = block_idx - self.resid_pdrop = resid_pdrop - self.norm_1 = nn.LayerNorm(hidden_size, bias=False) + self.resid_pdrop = config.resid_pdrop + self.norm_1 = nn.LayerNorm(config.d_model, bias=False) self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( config=config, block_idx=block_idx, ) - self.norm_2 = nn.LayerNorm(hidden_size, bias=False) + self.norm_2 = nn.LayerNorm(config.d_model, bias=False) def forward( self, @@ -857,12 +866,6 @@ def __init__(self, config: DbrxConfig, block_idx: int): self.block_idx = block_idx self.norm_attn_norm = DbrxNormAttentionNorm( config=config, - hidden_size=config.d_model, - num_heads=config.n_heads, - max_position_embeddings=config.max_seq_len, - resid_pdrop=config.resid_pdrop, - attn_implementation=config._attn_implementation, - attn_config=config.attn_config, block_idx=block_idx, ) self.ffn = DbrxFFN(config=config) From 1c5816ee159b1803ae0bdfd6128f452d656d296a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:06:15 +0000 Subject: [PATCH 054/131] fix --- src/transformers/models/dbrx/modeling_dbrx.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 7fc70c123a8a..38f89ff5e8da 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -375,7 +375,7 @@ def __init__(self, *args: Any, **kwargs: Any): # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # This module inherits from `DbrxAttention` as the weights of the module stays + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
# From: https://github.com/huggingface/transformers/blob/3b8e2932ce743008f63585aae1e1b8b30dc8b3ac/src/transformers/models/gemma/modeling_gemma.py#L318 self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() @@ -747,17 +747,9 @@ def __init__( self.layer = nn.Linear(self.hidden_size, self.moe_num_experts, bias=False) - def jitter(self, x: torch.Tensor) -> torch.Tensor: - if self.moe_jitter_eps is None: - raise RuntimeError("The router does not have moe_jitter_eps set.") - low = 1.0 - self.moe_jitter_eps - high = 1.0 + self.moe_jitter_eps - noise = torch.rand(x.size(), dtype=x.dtype, device=x.device) - return low + noise * (high - low) - def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: if self.training and self.moe_jitter_eps is not None: - x = x * self.jitter(x) + x *= torch.empty_like(x).uniform_(1.0 - self.moe_jitter_eps, 1.0 + self.moe_jitter_eps) weights = self.layer(x.view(-1, x.shape[-1])).softmax(dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) From 09f601ec192a3550a19971eeba4db9b8ecc3b1cd Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:10:45 +0000 Subject: [PATCH 055/131] fix again! --- src/transformers/models/dbrx/modeling_dbrx.py | 94 ++++++++++--------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 38f89ff5e8da..f74e26137e86 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -16,15 +16,14 @@ import math import warnings -from copy import deepcopy -from functools import partial -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import torch import torch.nn.functional as F import torch.utils.checkpoint from torch import nn +from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast @@ -210,33 +209,6 @@ def load_balancing_loss_func( return overall_loss * num_experts -############################################################################# - - -def resolve_ffn_act_fn(ffn_act_fn: dict) -> Callable[[torch.Tensor], torch.Tensor]: - """Resolve the activation function for the feed-forward network. - - Args: - ffn_act_fn (dict): The configuration dictionary for the activation function. - The dict config must specify the 'name' of a torch.nn.functional activation - function. All of other key values pairs are bound to the function as a partial. - - Returns: - Callable[[torch.Tensor], torch.Tensor]: The activation function. 
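As an editorial aside on the router change above (the `jitter` helper removed in favour of an in-place `uniform_`): a minimal stand-alone sketch, not part of the patch, of what the router forward now does — multiplicative jitter during training, a float32 softmax over experts, then a top-k selection. The sizes and the `training` flag are illustrative only.

```python
import torch
from torch import nn

hidden_size, moe_num_experts, moe_top_k, moe_jitter_eps = 64, 4, 2, 0.01
router = nn.Linear(hidden_size, moe_num_experts, bias=False)

x = torch.randn(2, 3, hidden_size)  # (batch, seq, hidden)
training = True

if training and moe_jitter_eps is not None:
    # Same effect as the removed jitter() helper: scale each activation by a factor
    # drawn uniformly from [1 - eps, 1 + eps].
    x = x * torch.empty_like(x).uniform_(1.0 - moe_jitter_eps, 1.0 + moe_jitter_eps)

weights = router(x.view(-1, hidden_size)).softmax(dim=-1, dtype=torch.float32)
top_weights, top_experts = torch.topk(weights, moe_top_k, dim=-1)
print(top_weights.shape, top_experts.shape)  # (batch * seq, moe_top_k) each
```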
- """ - config = deepcopy(ffn_act_fn) - name = config.pop("name") - if not hasattr(nn.functional, name): - raise ValueError(f"Unrecognised activation function name ({name}).") - act = getattr(nn.functional, name) - return partial(act, **config) - - -############################################################################# -# Copied from LLaMaAttention -############################################################################# - - # Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) @@ -600,7 +572,6 @@ def forward( "GemmaModel is using GemmaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) - return super().forward( hidden_states=hidden_states, attention_mask=attention_mask, @@ -687,6 +658,7 @@ def __init__( self.block_idx = block_idx self.resid_pdrop = config.resid_pdrop self.norm_1 = nn.LayerNorm(config.d_model, bias=False) + print(f"config._attn_implementation={config._attn_implementation}") self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( config=config, block_idx=block_idx, @@ -774,28 +746,43 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Lo class DbrxExpertGLU(nn.Module): - def __init__(self, hidden_size: int, ffn_hidden_size: int, ffn_act_fn: dict): + def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): super().__init__() - self.w1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) - self.v1 = nn.Linear(hidden_size, ffn_hidden_size, bias=False) - self.w2 = nn.Linear(ffn_hidden_size, hidden_size, bias=False) - self.activation_fn = resolve_ffn_act_fn(ffn_act_fn) + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.moe_num_experts = moe_num_experts + + self.w1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.v1 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + self.w2 = nn.Parameter(torch.empty(moe_num_experts * ffn_hidden_size, hidden_size)) + + act_fn_name = ffn_act_fn.pop("name", "silu") + if len(ffn_act_fn) != 0: + raise ValueError(f"FFN activation function has unhandled kwargs {ffn_act_fn=}") + self.activation_fn = ACT2FN[act_fn_name] + + def forward(self, x: torch.Tensor, expert_idx: int) -> torch.Tensor: + expert_w1 = self.w1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_v1 = self.v1.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] + expert_w2 = self.w2.view(self.moe_num_experts, self.ffn_hidden_size, self.hidden_size)[expert_idx] - def forward(self, x: torch.Tensor) -> torch.Tensor: - x1 = self.w1(x) - x2 = self.v1(x) - x1 = self.activation_fn(x1) - x1 = x1 * x2 - x1 = self.w2(x1) - return x1 + gate_proj = x.matmul(expert_w1.t()) + up_proj = x.matmul(expert_v1.t()) + gate_proj = self.activation_fn(gate_proj) + intermediate_states = gate_proj * up_proj + down_proj = intermediate_states.matmul(expert_w2) + return down_proj class DbrxExperts(nn.Module): def __init__(self, hidden_size: int, ffn_hidden_size: int, moe_num_experts: int, ffn_act_fn: dict): super().__init__() 
self.moe_num_experts = moe_num_experts - self.mlp_experts = nn.ModuleList( - [DbrxExpertGLU(hidden_size, ffn_hidden_size, ffn_act_fn) for _ in range(moe_num_experts)] + self.mlp = DbrxExpertGLU( + hidden_size=hidden_size, + ffn_hidden_size=ffn_hidden_size, + moe_num_experts=moe_num_experts, + ffn_act_fn=ffn_act_fn, ) def forward( @@ -815,7 +802,7 @@ def forward( topk_list = topk_idx.tolist() expert_tokens = x[None, token_list].reshape(-1, hidden_size) - expert_out = self.mlp_experts[expert_idx](expert_tokens) * top_weights[token_list, topk_list, None] + expert_out = self.mlp(expert_tokens, expert_idx) * top_weights[token_list, topk_list, None] out.index_add_(0, token_idx, expert_out) @@ -958,6 +945,10 @@ def _init_weights(self, module: nn.Module): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() + elif isinstance(module, DbrxExpertGLU): + module.w1.data.normal_(mean=0.0, std=std) + module.v1.data.normal_(mean=0.0, std=std) + module.w2.data.normal_(mean=0.0, std=std) def _setup_cache( self, cache_cls: Any, max_batch_size: int, max_cache_len: int @@ -1097,6 +1088,17 @@ def forward( all_hidden_states += (hidden_states,) # type: ignore if self.gradient_checkpointing and self.training: + # block_outputs = self._gradient_checkpointing_func( + # block.__call__, + # hidden_states, + # attention_mask=causal_mask, + # position_ids=position_ids, + # past_key_values=past_key_values, + # output_attentions=output_attentions, + # output_router_logits=output_router_logits, + # use_cache=use_cache, + # cache_position=cache_position, + # ) block_outputs = self._gradient_checkpointing_func( block.__call__, hidden_states, From 5a52bb90aa5e821f194b15c3a1c16f58d8a38b7a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:17:08 +0000 Subject: [PATCH 056/131] add informative comment --- src/transformers/models/dbrx/modeling_dbrx.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index f74e26137e86..a820e5680478 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,14 +35,10 @@ ) from .configuration_dbrx import DbrxConfig - if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -1281,8 +1277,11 @@ def forward( ```python >>> from transformers import AutoTokenizer, DbrxForCausalLM - >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") - >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct") + # ToDo: change `"eitanturok/dbrx-tiny"` to `"databricks/dbrx-instruct"` + # However, `"databricks/dbrx-instruct"` is a gated model which causes issues + # with circle.ci + >>> model = DbrxForCausalLM.from_pretrained("eitanturok/dbrx-tiny") + >>> tokenizer = AutoTokenizer.from_pretrained("eitanturok/dbrx-tiny") >>> prompt = "Hey, are you conscious? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") From cc6e5d8045cf6fbadfb4a617b346fcd230f2584f Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:26:12 +0000 Subject: [PATCH 057/131] fix ruff? 
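To summarise the MoE feed-forward rewrite above (fused per-expert weights in `DbrxExpertGLU`, a single shared module in `DbrxExperts`, and an `index_add_`-based accumulate): a simplified, self-contained sketch, not part of the patch. The routing tensors are random stand-ins, `silu` is assumed as the activation, and details such as expert-weight normalization are omitted.

```python
import torch
import torch.nn.functional as F

hidden_size, ffn_hidden_size, moe_num_experts, moe_top_k = 64, 128, 4, 2

# Fused expert weights: one flat tensor per matrix, viewed as
# (moe_num_experts, ffn_hidden_size, hidden_size) and sliced per expert.
w1 = torch.randn(moe_num_experts * ffn_hidden_size, hidden_size) * 0.02
v1 = torch.randn(moe_num_experts * ffn_hidden_size, hidden_size) * 0.02
w2 = torch.randn(moe_num_experts * ffn_hidden_size, hidden_size) * 0.02

def expert_glu(x, expert_idx):
    ew1 = w1.view(moe_num_experts, ffn_hidden_size, hidden_size)[expert_idx]
    ev1 = v1.view(moe_num_experts, ffn_hidden_size, hidden_size)[expert_idx]
    ew2 = w2.view(moe_num_experts, ffn_hidden_size, hidden_size)[expert_idx]
    gate = F.silu(x @ ew1.t())   # (tokens, ffn_hidden_size)
    up = x @ ev1.t()             # (tokens, ffn_hidden_size)
    return (gate * up) @ ew2     # (tokens, hidden_size)

tokens = torch.randn(6, hidden_size)                            # flattened (batch * seq, hidden)
top_weights = torch.rand(6, moe_top_k)
top_experts = torch.randint(0, moe_num_experts, (6, moe_top_k))

out = torch.zeros_like(tokens)
for expert_idx in range(moe_num_experts):
    token_idx, topk_idx = torch.where(top_experts == expert_idx)
    if token_idx.numel() == 0:
        continue
    expert_out = expert_glu(tokens[token_idx], expert_idx) * top_weights[token_idx, topk_idx, None]
    out.index_add_(0, token_idx, expert_out)  # accumulate each expert's weighted contribution
print(out.shape)
```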
--- src/transformers/models/dbrx/modeling_dbrx.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index a820e5680478..3ec08b19bcf1 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,10 +35,14 @@ ) from .configuration_dbrx import DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) From 4c5e12718467471d09046204f8039464a77b07c9 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:28:48 +0000 Subject: [PATCH 058/131] remove print statement + style --- src/transformers/models/dbrx/modeling_dbrx.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 3ec08b19bcf1..a41cdc024811 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -658,7 +658,6 @@ def __init__( self.block_idx = block_idx self.resid_pdrop = config.resid_pdrop self.norm_1 = nn.LayerNorm(config.d_model, bias=False) - print(f"config._attn_implementation={config._attn_implementation}") self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( config=config, block_idx=block_idx, From 0f562aa62af05fd5e3720e10f3d2e4b3f4d24331 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:49:42 +0000 Subject: [PATCH 059/131] change doc-test --- src/transformers/models/dbrx/modeling_dbrx.py | 37 ++++++++----------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index a41cdc024811..69ebd1b828a3 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,14 +35,10 @@ ) from .configuration_dbrx import DbrxConfig - if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -1276,24 +1272,23 @@ def forward( ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r"""Forward function for causal language modeling. 
- Example: - ```python - >>> from transformers import AutoTokenizer, DbrxForCausalLM + # ToDo: this doc-test fails in circle.ci because "databricks/dbrx-instruct" is a gated repo + # and circle.ci doesn't have access to it + # Example: + # ```python + # >>> from transformers import AutoTokenizer, DbrxForCausalLM - # ToDo: change `"eitanturok/dbrx-tiny"` to `"databricks/dbrx-instruct"` - # However, `"databricks/dbrx-instruct"` is a gated model which causes issues - # with circle.ci - >>> model = DbrxForCausalLM.from_pretrained("eitanturok/dbrx-tiny") - >>> tokenizer = AutoTokenizer.from_pretrained("eitanturok/dbrx-tiny") + # >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") + # >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True) - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") + # >>> prompt = "Hey, are you conscious? Can you talk to me?" + # >>> inputs = tokenizer(prompt, return_tensors="pt") - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ``` + # >>> # Generate + # >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + # >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + # "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + # ``` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 62a512eeb4aff3dd9633da4c4f0f05c073c3a523 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 05:49:44 +0000 Subject: [PATCH 060/131] fix doc-test --- src/transformers/models/dbrx/modeling_dbrx.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 69ebd1b828a3..497f5d03a7ba 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,10 +35,14 @@ ) from .configuration_dbrx import DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) From aae804503987deeb9d5f348d2333350e88295eb2 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 16:08:50 +0000 Subject: [PATCH 061/131] fix docstring --- src/transformers/models/dbrx/modeling_dbrx.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 497f5d03a7ba..3105650bb3a9 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1276,23 +1276,21 @@ def forward( ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r"""Forward function for causal language modeling. 
- # ToDo: this doc-test fails in circle.ci because "databricks/dbrx-instruct" is a gated repo - # and circle.ci doesn't have access to it - # Example: - # ```python - # >>> from transformers import AutoTokenizer, DbrxForCausalLM - - # >>> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") - # >>> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True) - - # >>> prompt = "Hey, are you conscious? Can you talk to me?" - # >>> inputs = tokenizer(prompt, return_tensors="pt") - - # >>> # Generate - # >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - # >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - # "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - # ``` + Example: + ```python + >> from transformers import AutoTokenizer, DbrxForCausalLM + + >> model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct") + >> tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", trust_remote_code=True) + + >> prompt = "Hey, are you conscious? Can you talk to me?" + >> inputs = tokenizer(prompt, return_tensors="pt") + + >> # Generate + >> generate_ids = model.generate(inputs.input_ids, max_length=30) + >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ``` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From c3870bcf813880132628632854075c7dd1ec1b5d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 18:52:45 +0000 Subject: [PATCH 062/131] delete commented out text --- src/transformers/models/dbrx/modeling_dbrx.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 3105650bb3a9..25ce0329c6da 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1087,17 +1087,6 @@ def forward( all_hidden_states += (hidden_states,) # type: ignore if self.gradient_checkpointing and self.training: - # block_outputs = self._gradient_checkpointing_func( - # block.__call__, - # hidden_states, - # attention_mask=causal_mask, - # position_ids=position_ids, - # past_key_values=past_key_values, - # output_attentions=output_attentions, - # output_router_logits=output_router_logits, - # use_cache=use_cache, - # cache_position=cache_position, - # ) block_outputs = self._gradient_checkpointing_func( block.__call__, hidden_states, From efd10b8b64e87352d7e2cbbb8be8139b161702fd Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 19:14:10 +0000 Subject: [PATCH 063/131] make defaults match dbrx-instruct --- .../models/dbrx/configuration_dbrx.py | 67 +++++++++---------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index fc5a917eafcd..4a8e1ade3f7f 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -38,18 +38,18 @@ class DbrxAttentionConfig(PretrainedConfig): Args: attn_pdrop (`float`, *optional*, defaults to 0.0): The dropout probability for the attention layers. 
- clip_qkv (`float`, *optional*, defualts to None): + clip_qkv (`float`, *optional*, defualts to 8.0): If not `None`, clip the queries, keys, and values in the attention layer to this value. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - rope_theta (float): The base frequency for rope. + kv_n_heads (`Optional[int]`, defaults to 8): For grouped_query_attention only, allow user to specify number of kv heads. + rope_theta (`float`, defaults to 500000): The base frequency for rope. """ def __init__( self, - attn_pdrop: float = 0, - clip_qkv: Optional[float] = None, - kv_n_heads: int = 1, - rope_theta: float = 10000.0, + attn_pdrop: float = 0.0, + clip_qkv: Optional[float] = 8.0, + kv_n_heads: int = 8, + rope_theta: float = 500000, **kwargs: Any, ): super().__init__(**kwargs) @@ -92,34 +92,33 @@ class DbrxFFNConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - ffn_act_fn (dict, optional): A dict specifying activation function for the FFN. + ffn_act_fn (`dict`, defaults to `{"name": "silu"}`): A dict specifying activation function for the FFN. The dict should have a key 'name' with the value being the name of the activation function along with any additional keyword arguments. - ffn_hidden_size (int, optional): The hidden size of the feedforward network. - moe_num_experts (int, optional): The number of experts in the mixture of experts layer. - moe_top_k (int, optional): The number of experts to use in the mixture of experts layer. - moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer. - router_aux_loss_coef (float, optional): The loss weight for the mixture of experts layer. - moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights. - uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment. + ffn_hidden_size (`int`, defaults to 10752): The hidden size of the feedforward network. + moe_num_experts (`int`, defaults to 16): The number of experts in the mixture of experts layer. + moe_top_k (`int`, defaults to 4): The number of experts to use in the mixture of experts layer. + moe_jitter_eps (`float`, defaults to 0.0): The jitter epsilon for the mixture of experts layer. + router_aux_loss_coef (`float`, defaults to 0.05): The loss weight for the mixture of experts layer. + moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. + uniform_expert_assignment (`bool`, defaults to `False`): Whether to use uniform expert assignment. This should only be used for benchmarking purposes. 
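Stepping outside the diff: with the defaults now mirroring dbrx-instruct, small configurations for tests or docs have to be spelled out explicitly. A hedged sketch of doing so, assuming the `DbrxConfig`/`DbrxAttentionConfig`/`DbrxFFNConfig` signatures introduced in this PR (the sub-configs can be passed as plain dicts); the sizes below are arbitrary illustration values, not a recommended setup.

```python
from transformers import DbrxConfig  # available once this PR is merged

config = DbrxConfig(
    d_model=256,
    n_heads=8,
    n_layers=2,
    max_seq_len=512,
    vocab_size=1024,
    attn_config={"kv_n_heads": 2, "clip_qkv": 8.0, "rope_theta": 10000.0},
    ffn_config={"ffn_hidden_size": 512, "moe_num_experts": 4, "moe_top_k": 2},
)
print(config.ffn_config.moe_num_experts)  # sub-configs are built from the dicts
```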
""" def __init__( self, - ffn_act_fn: Optional[dict] = None, - ffn_hidden_size: int = 3584, - moe_num_experts: int = 4, - moe_top_k: int = 1, - moe_jitter_eps: Optional[float] = None, - router_aux_loss_coef: float = 0.01, - moe_normalize_expert_weights: Optional[float] = 1, + ffn_act_fn: dict = {"name": "silu"}, + ffn_hidden_size: int = 10752, + moe_num_experts: int = 16, + moe_top_k: int = 4, + moe_jitter_eps: float = 0.0, + router_aux_loss_coef: float = 0.05, + moe_normalize_expert_weights: Optional[float] = 1.0, uniform_expert_assignment: bool = False, **kwargs: Any, ): super().__init__() - if ffn_act_fn is None: - ffn_act_fn = {"name": "silu"} + self.ffn_act_fn = ffn_act_fn self.ffn_hidden_size = ffn_hidden_size self.moe_num_experts = moe_num_experts @@ -167,15 +166,15 @@ class DbrxConfig(PretrainedConfig): Args: - d_model (`int`, *optional*, defaults to 2048): + d_model (`int`, *optional*, defaults to 6144): Dimensionality of the embeddings and hidden states. - n_heads (`int`, *optional*, defaults to 16): + n_heads (`int`, *optional*, defaults to 48): Number of attention heads for each attention layer in the Transformer encoder. - n_layers (`int`, *optional*, defaults to 24): + n_layers (`int`, *optional*, defaults to 40): Number of hidden layers in the Transformer encoder. - max_seq_len (`int`, *optional*, defaults to 2048): + max_seq_len (`int`, *optional*, defaults to 32768): The maximum sequence length of the model. - vocab_size (`int`, *optional*, defaults to 32000): + vocab_size (`int`, *optional*, defaults to 100352): Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by the `inputs_ids` passed when calling [`DbrxModel`]. resid_pdrop (`float`, *optional*, defaults to 0.0): @@ -222,11 +221,11 @@ class DbrxConfig(PretrainedConfig): def __init__( self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - max_seq_len: int = 2048, - vocab_size: int = 32000, + d_model: int = 6144, + n_heads: int = 48, + n_layers: int = 40, + max_seq_len: int = 32768, + vocab_size: int = 100352, resid_pdrop: float = 0.0, emb_pdrop: float = 0.0, attn_config: Optional[DbrxAttentionConfig] = None, From ea836a86f376f78434c655799f4f06ac6ad60649 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 20:44:45 +0000 Subject: [PATCH 064/131] replace `router_aux_loss_coef` with `moe_loss_weight` --- .../models/dbrx/configuration_dbrx.py | 16 +++------------- src/transformers/models/dbrx/modeling_dbrx.py | 4 ++-- tests/models/dbrx/test_modeling_dbrx.py | 11 ++++------- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 4a8e1ade3f7f..6fa6d6ae1b43 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -99,7 +99,7 @@ class DbrxFFNConfig(PretrainedConfig): moe_num_experts (`int`, defaults to 16): The number of experts in the mixture of experts layer. moe_top_k (`int`, defaults to 4): The number of experts to use in the mixture of experts layer. moe_jitter_eps (`float`, defaults to 0.0): The jitter epsilon for the mixture of experts layer. - router_aux_loss_coef (`float`, defaults to 0.05): The loss weight for the mixture of experts layer. + moe_loss_weight (`float`, defaults to 0.05): The loss weight for the mixture of experts layer. moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. 
uniform_expert_assignment (`bool`, defaults to `False`): Whether to use uniform expert assignment. This should only be used for benchmarking purposes. @@ -112,7 +112,7 @@ def __init__( moe_num_experts: int = 16, moe_top_k: int = 4, moe_jitter_eps: float = 0.0, - router_aux_loss_coef: float = 0.05, + moe_loss_weight: float = 0.05, moe_normalize_expert_weights: Optional[float] = 1.0, uniform_expert_assignment: bool = False, **kwargs: Any, @@ -124,9 +124,7 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k self.moe_jitter_eps = moe_jitter_eps - self.router_aux_loss_coef = ( - router_aux_loss_coef if "moe_loss_weight" not in kwargs else kwargs["moe_loss_weight"] - ) + self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights self.uniform_expert_assignment = uniform_expert_assignment @@ -192,8 +190,6 @@ class DbrxConfig(PretrainedConfig): output_router_logits (`bool`, *optional*, defaults to `False`): Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. See [here]() for more details - router_aux_loss_coef (`float`, *optional*, defaults to 0.05): - The aux loss factor for the total loss. Example: @@ -233,7 +229,6 @@ def __init__( use_cache: bool = True, initializer_range: float = 0.02, output_router_logits: bool = False, - router_aux_loss_coef: float = 0.05, **kwargs: Any, ): if attn_config is None: @@ -246,10 +241,6 @@ def __init__( if ffn_config is None: self.ffn_config = DbrxFFNConfig() elif isinstance(ffn_config, dict): - # use router_aux_loss_coef over ffn_config["moe_loss_weight"] - if "moe_loss_weight" in ffn_config and "router_aux_loss_coef" not in ffn_config: - ffn_config["router_aux_loss_coef"] = ffn_config["moe_loss_weight"] - del ffn_config["moe_loss_weight"] self.ffn_config = DbrxFFNConfig(**ffn_config) else: self.ffn_config = ffn_config @@ -264,7 +255,6 @@ def __init__( self.use_cache = use_cache self.initializer_range = initializer_range self.output_router_logits = output_router_logits - self.router_aux_loss_coef = router_aux_loss_coef tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 25ce0329c6da..e0f5843874ba 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1223,7 +1223,7 @@ def __init__(self, config: DbrxConfig): self.transformer = DbrxModel(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - self.router_aux_loss_coef = config.router_aux_loss_coef + self.moe_loss_weight = config.ffn_config.moe_loss_weight self.num_experts = config.ffn_config.moe_num_experts self.num_experts_per_tok = config.ffn_config.moe_top_k @@ -1330,7 +1330,7 @@ def forward( attention_mask, ) if labels is not None and loss is not None: - loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device + loss += self.moe_loss_weight * aux_loss.to(loss.device) # make sure to reside in the same device if not return_dict: output = (logits,) + outputs[1:] diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 266c94d9bff2..f2e20077f3cf 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -60,12 +60,11 @@ def __init__( moe_loss_weight=0.05, 
moe_num_experts=16, moe_top_k=4, - fnn_config_model_type="", + ffn_config_model_type="", ffn_act_fn_name="gelu", initializer_range=0.02, output_router_logits=False, resid_pdrop=0.0, - router_aux_loss_coef=0.05, tie_word_embeddings=False, torch_dtype="bfloat16", vocab_size=99, @@ -90,13 +89,13 @@ def __init__( self.rope_theta = rope_theta self.attn_config_model_type = attn_config_model_type - # fnn_config params + # ffn_config params self.ffn_hidden_size = ffn_hidden_size self.moe_jitter_eps = moe_jitter_eps self.moe_loss_weight = moe_loss_weight self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.fnn_config_model_type = fnn_config_model_type + self.ffn_config_model_type = ffn_config_model_type self.ffn_act_fn_name = ffn_act_fn_name # Other params @@ -110,7 +109,6 @@ def __init__( self.emb_pdrop = emb_pdrop self.output_router_logits = output_router_logits self.resid_pdrop = resid_pdrop - self.router_aux_loss_coef = router_aux_loss_coef self.tie_word_embeddings = tie_word_embeddings self.torch_dtype = torch_dtype @@ -121,7 +119,7 @@ def __init__( "moe_loss_weight": self.moe_loss_weight, "moe_num_experts": self.moe_num_experts, "moe_top_k": self.moe_top_k, - "model_type": self.fnn_config_model_type, + "model_type": self.ffn_config_model_type, "ffn_act_fn": {"name": self.ffn_act_fn_name}, } self.attn_config = { @@ -172,7 +170,6 @@ def get_config(self): use_cache=self.use_cache, initializer_range=self.initializer_range, output_router_logits=self.output_router_logits, - router_aux_loss_coef=self.router_aux_loss_coef, is_decoder=False, ) return config From c46e06bc85ab0440e78d7ec843f101898c9c6845 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 21:52:02 +0000 Subject: [PATCH 065/131] is_decoder=True --- .../models/dbrx/configuration_dbrx.py | 7 ++++++- tests/models/dbrx/test_modeling_dbrx.py | 20 +++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 6fa6d6ae1b43..710bbf5d9332 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -189,7 +189,9 @@ class DbrxConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. output_router_logits (`bool`, *optional*, defaults to `False`): Whether or not the router logits should be returned by the model. Enabling this will also - allow the model to output the auxiliary loss. See [here]() for more details + allow the model to output the auxiliary loss. See [here]() for more details. + is_decoder (`bool`, defaults to `True`): Whether the model is used as decoder or not (in which case + it’s used as an encoder). 
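For context on the `moe_loss_weight` rename in the previous patch: the weight scales the router load-balancing term before it is added to the language-modeling loss. Below is a simplified, stand-alone sketch of that combination; `load_balancing_loss` here is only a rough stand-in for the `load_balancing_loss_func` in `modeling_dbrx.py` (the exact normalization there may differ), and all tensors are random placeholders.

```python
import torch
import torch.nn.functional as F

def load_balancing_loss(router_logits, num_experts, top_k):
    # Encourage uniform routing: combine the fraction of tokens sent to each expert
    # with the mean router probability assigned to that expert.
    probs = router_logits.softmax(dim=-1)                   # (tokens, num_experts)
    _, selected = torch.topk(probs, top_k, dim=-1)
    expert_mask = F.one_hot(selected, num_experts).float()  # (tokens, top_k, num_experts)
    tokens_per_expert = expert_mask.mean(dim=(0, 1))
    router_prob_per_expert = probs.mean(dim=0)
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)

moe_loss_weight = 0.05
lm_loss = torch.tensor(2.3)         # stand-in for the cross-entropy loss
router_logits = torch.randn(16, 4)  # (tokens, num_experts)
aux_loss = load_balancing_loss(router_logits, num_experts=4, top_k=2)
total_loss = lm_loss + moe_loss_weight * aux_loss
print(total_loss)
```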
Example: @@ -229,6 +231,7 @@ def __init__( use_cache: bool = True, initializer_range: float = 0.02, output_router_logits: bool = False, + is_decoder: bool = True, **kwargs: Any, ): if attn_config is None: @@ -255,6 +258,7 @@ def __init__( self.use_cache = use_cache self.initializer_range = initializer_range self.output_router_logits = output_router_logits + self.is_decoder = is_decoder tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: @@ -262,5 +266,6 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, + is_decoder=self.is_decoder, **kwargs, ) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index f2e20077f3cf..5e2dd904b4f4 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -18,7 +18,11 @@ import unittest from transformers import DbrxConfig, is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import ( + require_torch, + slow, + torch_device, +) from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -68,6 +72,7 @@ def __init__( tie_word_embeddings=False, torch_dtype="bfloat16", vocab_size=99, + is_decoder=True, ): # Parameters unique to testing self.batch_size = batch_size @@ -98,7 +103,7 @@ def __init__( self.ffn_config_model_type = ffn_config_model_type self.ffn_act_fn_name = ffn_act_fn_name - # Other params + # Other model params self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -111,6 +116,7 @@ def __init__( self.resid_pdrop = resid_pdrop self.tie_word_embeddings = tie_word_embeddings self.torch_dtype = torch_dtype + self.is_decoder = is_decoder # Make the dictionaries self.ffn_config = { @@ -170,7 +176,7 @@ def get_config(self): use_cache=self.use_cache, initializer_range=self.initializer_range, output_router_logits=self.output_router_logits, - is_decoder=False, + is_decoder=self.is_decoder, ) return config @@ -322,7 +328,7 @@ class DbrxModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DbrxModelTester(self) - self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37) + self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37, is_decoder=True) def test_config(self): self.config_tester.run_common_tests() @@ -343,6 +349,12 @@ def test_model_from_pretrained(self): model = DbrxModel.from_pretrained(model_name) self.assertIsNotNone(model) + # @is_flaky(max_attempts=3, description="flaky on some models.") + # @require_torch_sdpa + # @slow + # def test_eager_matches_sdpa_generate(self): + # super().test_eager_matches_sdpa_generate() + @require_torch class DbrxModelIntegrationTest(unittest.TestCase): From aab6fd6a6096643c2c783fc90a5afb60e8a56fc6 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 21:53:29 +0000 Subject: [PATCH 066/131] remove is_decoder from configtester --- tests/models/dbrx/test_modeling_dbrx.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 5e2dd904b4f4..f0fc3799fb09 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -18,11 +18,7 @@ import unittest from transformers import DbrxConfig, is_torch_available -from transformers.testing_utils 
import ( - require_torch, - slow, - torch_device, -) +from transformers.testing_utils import require_torch, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -328,7 +324,7 @@ class DbrxModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = DbrxModelTester(self) - self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37, is_decoder=True) + self.config_tester = ConfigTester(self, config_class=DbrxConfig, d_model=37) def test_config(self): self.config_tester.run_common_tests() From 179834b0ea059b8fa8355d2817bca14ed1b75209 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:02:48 +0000 Subject: [PATCH 067/131] implement sdpa properly --- src/transformers/models/dbrx/modeling_dbrx.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index e0f5843874ba..f309533157aa 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -609,7 +609,7 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + key_states, value_states = past_key_value.update(key_states, value_states, self.block_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) @@ -636,7 +636,7 @@ def forward( attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.view(bsz, q_len, -1) - attn_output = self.o_proj(attn_output) + attn_output = self.out_proj(attn_output) return attn_output, None, past_key_value @@ -927,7 +927,7 @@ class DbrxPreTrainedModel(PreTrainedModel): _no_split_modules = ["DbrxBlock"] _skip_keys_device_placement = ["past_key_values"] _supports_flash_attn_2 = True - _supports_sdpa = False + _supports_sdpa = True _supports_cache_class = True def _init_weights(self, module: nn.Module): From f053b7b1a980421612ef23410daee8125abea1a7 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:14:27 +0000 Subject: [PATCH 068/131] make is_decoder pass tests --- src/transformers/models/dbrx/configuration_dbrx.py | 2 +- src/transformers/models/dbrx/modeling_dbrx.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 710bbf5d9332..7cdbd49ef1a9 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -190,7 +190,7 @@ class DbrxConfig(PretrainedConfig): output_router_logits (`bool`, *optional*, defaults to `False`): Whether or not the router logits should be returned by the model. Enabling this will also allow the model to output the auxiliary loss. See [here]() for more details. - is_decoder (`bool`, defaults to `True`): Whether the model is used as decoder or not (in which case + is_decoder (`bool`, defaults to `True`, *optional*, defaults to `True`): Whether the model is used as decoder or not (in which case it’s used as an encoder). 
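With `_supports_sdpa` switched to `True` in the patch above, DBRX picks up the standard `attn_implementation` selector used across the library. A hedged usage sketch — the checkpoint is gated, so substitute any locally available DBRX checkpoint, and the dtype is only a suggestion:

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "databricks/dbrx-instruct",   # gated; swap in a local or tiny test checkpoint
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",   # or "eager" / "flash_attention_2"
)
```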
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index f309533157aa..4bbaae9e6e5c 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -229,10 +229,12 @@ def __init__( self, config: DbrxConfig, block_idx: Optional[int] = None, + is_decoder: bool = True, ): super().__init__() self.config = config self.hidden_size = config.d_model + self.is_decoder = is_decoder self.num_heads = config.n_heads self.head_dim = self.hidden_size // self.num_heads self.max_position_embeddings = config.max_seq_len @@ -661,6 +663,7 @@ def __init__( self.attn = DBRX_ATTENTION_CLASSES[config._attn_implementation]( config=config, block_idx=block_idx, + is_decoder=config.is_decoder, ) self.norm_2 = nn.LayerNorm(config.d_model, bias=False) From cdea470fdb2e13a77d434de05db99cf6c6951e62 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:17:36 +0000 Subject: [PATCH 069/131] start on the GenerationTesterMixin tests --- tests/models/dbrx/test_modeling_dbrx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index f0fc3799fb09..a14dcee6e261 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -20,6 +20,7 @@ from transformers import DbrxConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device +from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -316,7 +317,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class DbrxModelTest(ModelTesterMixin, unittest.TestCase): +class DbrxModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (DbrxModel, DbrxForCausalLM) if is_torch_available() else () all_generative_model_classes = (DbrxForCausalLM,) if is_torch_available() else () test_headmasking = False From b7dafdd833c995f93c9fe786c6ff19c11191ea33 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:24:33 +0000 Subject: [PATCH 070/131] add dbrx to sdpa documentation --- docs/source/en/perf_infer_gpu_one.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 011d38da558d..cbf4a9db5018 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -177,9 +177,10 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o For now, Transformers supports SDPA inference and training for the following architectures: * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) -* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) +* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) +* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) * 
[Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) From 351bff2a9c9f29db3a3cea1c9dc705266eea735f Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:32:45 +0000 Subject: [PATCH 071/131] skip weight typing test --- src/transformers/models/dbrx/modeling_dbrx.py | 9 ++------- tests/models/dbrx/test_modeling_dbrx.py | 5 ++++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 4bbaae9e6e5c..99bf08406c2d 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,14 +35,10 @@ ) from .configuration_dbrx import DbrxConfig - if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import ( - index_first_axis, - pad_input, # noqa - unpad_input, - ) + from flash_attn.bert_padding import pad_input # noqa + from flash_attn.bert_padding import index_first_axis, unpad_input logger = logging.get_logger(__name__) @@ -1219,7 +1215,6 @@ def _update_causal_mask( class DbrxForCausalLM(DbrxPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: DbrxConfig): super().__init__(config) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index a14dcee6e261..cda6c5e742cc 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -24,7 +24,6 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask - if is_torch_available(): import torch @@ -346,6 +345,10 @@ def test_model_from_pretrained(self): model = DbrxModel.from_pretrained(model_name) self.assertIsNotNone(model) + @unittest.skip("Dbrx models have weight tying disabled.") + def test_tied_weights_keys(self): + pass + # @is_flaky(max_attempts=3, description="flaky on some models.") # @require_torch_sdpa # @slow From fca26d486469b2269b7eeac48d040062cbd18ace Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 1 Apr 2024 22:43:34 +0000 Subject: [PATCH 072/131] style --- src/transformers/models/dbrx/modeling_dbrx.py | 9 ++++++--- tests/models/dbrx/test_modeling_dbrx.py | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 99bf08406c2d..51e56cc5371a 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -35,10 +35,14 @@ ) from .configuration_dbrx import DbrxConfig + if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import pad_input # noqa - from flash_attn.bert_padding import index_first_axis, unpad_input + from flash_attn.bert_padding import ( + index_first_axis, + pad_input, # noqa + unpad_input, + ) logger = logging.get_logger(__name__) @@ -1215,7 +1219,6 @@ def _update_causal_mask( class DbrxForCausalLM(DbrxPreTrainedModel): - def __init__(self, config: DbrxConfig): super().__init__(config) self.transformer = DbrxModel(config) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index cda6c5e742cc..1639aab13f13 
100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -24,6 +24,7 @@ from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + if is_torch_available(): import torch From cfef3ec86237c3ab2a5bdd80ad9010c80696e5ad Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Tue, 2 Apr 2024 12:01:11 -0400 Subject: [PATCH 073/131] initialize smaller model Co-authored-by: Matt --- src/transformers/models/dbrx/configuration_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 7cdbd49ef1a9..a128fe7e4504 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -199,7 +199,7 @@ class DbrxConfig(PretrainedConfig): >>> from transformers import DbrxConfig, DbrxModel >>> # Initializing a Dbrx configuration - >>> configuration = DbrxConfig() + >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128) >>> # Initializing a model (with random weights) from the configuration >>> model = DbrxModel(configuration) From d0f7bef0298affec93ab4c9eec138fbfd1f4fbc6 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 2 Apr 2024 17:39:49 +0100 Subject: [PATCH 074/131] Add DBRX to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 92ee8eeda447..4b271c61a96f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -320,6 +320,8 @@ title: CPMANT - local: model_doc/ctrl title: CTRL + - local: model_doc/dbrx + title: DBRX - local: model_doc/deberta title: DeBERTa - local: model_doc/deberta-v2 From 99dcef7a6d2f1789f0d83d107e9f8df18302ff64 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 2 Apr 2024 19:19:34 +0000 Subject: [PATCH 075/131] skip test_new_cache_format --- tests/models/dbrx/test_modeling_dbrx.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 1639aab13f13..0b526766815f 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -17,6 +17,8 @@ import unittest +from parameterized import parameterized + from transformers import DbrxConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -350,11 +352,10 @@ def test_model_from_pretrained(self): def test_tied_weights_keys(self): pass - # @is_flaky(max_attempts=3, description="flaky on some models.") - # @require_torch_sdpa - # @slow - # def test_eager_matches_sdpa_generate(self): - # super().test_eager_matches_sdpa_generate() + @unittest.skip("TODO @gante fix this for Llama") + @parameterized.expand([(1, False), (1, True), (4, False)]) + def test_new_cache_format(self, num_beams, do_sample): + pass @require_torch From fb5ed6769f3747ce16d456f189d7d15394e55002 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 2 Apr 2024 19:55:45 +0000 Subject: [PATCH 076/131] make config defaults smaller again --- .../models/dbrx/configuration_dbrx.py | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index a128fe7e4504..a943f6456019 
100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -38,18 +38,18 @@ class DbrxAttentionConfig(PretrainedConfig): Args: attn_pdrop (`float`, *optional*, defaults to 0.0): The dropout probability for the attention layers. - clip_qkv (`float`, *optional*, defualts to 8.0): + clip_qkv (`float`, *optional*, defualts to `None`): If not `None`, clip the queries, keys, and values in the attention layer to this value. - kv_n_heads (`Optional[int]`, defaults to 8): For grouped_query_attention only, allow user to specify number of kv heads. - rope_theta (`float`, defaults to 500000): The base frequency for rope. + kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads. + rope_theta (`float`, defaults to 10000.0): The base frequency for rope. """ def __init__( self, attn_pdrop: float = 0.0, - clip_qkv: Optional[float] = 8.0, - kv_n_heads: int = 8, - rope_theta: float = 500000, + clip_qkv: Optional[float] = None, + kv_n_heads: int = 1, + rope_theta: float = 10000.0, **kwargs: Any, ): super().__init__(**kwargs) @@ -92,14 +92,14 @@ class DbrxFFNConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - ffn_act_fn (`dict`, defaults to `{"name": "silu"}`): A dict specifying activation function for the FFN. - The dict should have a key 'name' with the value being the name of - the activation function along with any additional keyword arguments. - ffn_hidden_size (`int`, defaults to 10752): The hidden size of the feedforward network. - moe_num_experts (`int`, defaults to 16): The number of experts in the mixture of experts layer. - moe_top_k (`int`, defaults to 4): The number of experts to use in the mixture of experts layer. - moe_jitter_eps (`float`, defaults to 0.0): The jitter epsilon for the mixture of experts layer. - moe_loss_weight (`float`, defaults to 0.05): The loss weight for the mixture of experts layer. + ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN. + The dict should have a key 'name' with the value being the name of the activation function along with + any additional keyword arguments. If `None`, then set to `{"name": "silu"}`. + ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network. + moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer. + moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer. + moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer. + moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer. moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. uniform_expert_assignment (`bool`, defaults to `False`): Whether to use uniform expert assignment. This should only be used for benchmarking purposes. 
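As a quick sanity check on the defaults being reverted in this patch (continued in the hunks that follow), a bare `DbrxConfig()` should once again describe a small model rather than dbrx-instruct, which keeps doc examples and CI cheap. A hedged sketch; the expected values are simply the defaults shown in this patch:

```python
from transformers import DbrxConfig  # available once this PR is merged

config = DbrxConfig()
print(config.d_model, config.n_heads, config.n_layers)                 # expected: 2048 16 24
print(config.attn_config.kv_n_heads, config.attn_config.rope_theta)    # expected: 1 10000.0
print(config.ffn_config.moe_num_experts, config.ffn_config.moe_top_k)  # expected: 4 1
```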
@@ -107,18 +107,19 @@ class DbrxFFNConfig(PretrainedConfig): def __init__( self, - ffn_act_fn: dict = {"name": "silu"}, - ffn_hidden_size: int = 10752, - moe_num_experts: int = 16, - moe_top_k: int = 4, - moe_jitter_eps: float = 0.0, - moe_loss_weight: float = 0.05, + ffn_act_fn: dict = None, + ffn_hidden_size: int = 3584, + moe_num_experts: int = 4, + moe_top_k: int = 1, + moe_jitter_eps: Optional[float] = None, + moe_loss_weight: float = 0.01, moe_normalize_expert_weights: Optional[float] = 1.0, uniform_expert_assignment: bool = False, **kwargs: Any, ): super().__init__() - + if ffn_act_fn is None: + ffn_act_fn = {"name": "silu"} self.ffn_act_fn = ffn_act_fn self.ffn_hidden_size = ffn_hidden_size self.moe_num_experts = moe_num_experts @@ -157,22 +158,22 @@ class DbrxConfig(PretrainedConfig): This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture. + defaults will yield a different configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: - d_model (`int`, *optional*, defaults to 6144): + d_model (`int`, *optional*, defaults to 2048): Dimensionality of the embeddings and hidden states. - n_heads (`int`, *optional*, defaults to 48): + n_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - n_layers (`int`, *optional*, defaults to 40): + n_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - max_seq_len (`int`, *optional*, defaults to 32768): + max_seq_len (`int`, *optional*, defaults to 2048): The maximum sequence length of the model. - vocab_size (`int`, *optional*, defaults to 100352): + vocab_size (`int`, *optional*, defaults to 32000): Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by the `inputs_ids` passed when calling [`DbrxModel`]. 
resid_pdrop (`float`, *optional*, defaults to 0.0): @@ -219,11 +220,11 @@ class DbrxConfig(PretrainedConfig): def __init__( self, - d_model: int = 6144, - n_heads: int = 48, - n_layers: int = 40, - max_seq_len: int = 32768, - vocab_size: int = 100352, + d_model: int = 2048, + n_heads: int = 16, + n_layers: int = 24, + max_seq_len: int = 2048, + vocab_size: int = 32000, resid_pdrop: float = 0.0, emb_pdrop: float = 0.0, attn_config: Optional[DbrxAttentionConfig] = None, From 24b28b5e46db3324d44a22fcba0d485554c92b52 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 2 Apr 2024 20:17:01 +0000 Subject: [PATCH 077/131] add pad_token_id --- src/transformers/models/dbrx/configuration_dbrx.py | 7 +++++-- tests/models/dbrx/test_modeling_dbrx.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index a943f6456019..e2e8ce1ef095 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -193,6 +193,8 @@ class DbrxConfig(PretrainedConfig): allow the model to output the auxiliary loss. See [here]() for more details. is_decoder (`bool`, defaults to `True`, *optional*, defaults to `True`): Whether the model is used as decoder or not (in which case it’s used as an encoder). + pad_token_id (`int`, *optional*): + Padding token id. Example: @@ -233,6 +235,7 @@ def __init__( initializer_range: float = 0.02, output_router_logits: bool = False, is_decoder: bool = True, + pad_token_id: Optional[int] = None, **kwargs: Any, ): if attn_config is None: @@ -259,7 +262,6 @@ def __init__( self.use_cache = use_cache self.initializer_range = initializer_range self.output_router_logits = output_router_logits - self.is_decoder = is_decoder tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: @@ -267,6 +269,7 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, - is_decoder=self.is_decoder, + is_decoder=is_decoder, + pad_token_id=pad_token_id, **kwargs, ) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 0b526766815f..6039f9f3a148 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -72,6 +72,7 @@ def __init__( torch_dtype="bfloat16", vocab_size=99, is_decoder=True, + pad_token_id=0, ): # Parameters unique to testing self.batch_size = batch_size @@ -116,6 +117,7 @@ def __init__( self.tie_word_embeddings = tie_word_embeddings self.torch_dtype = torch_dtype self.is_decoder = is_decoder + self.pad_token_id = pad_token_id # Make the dictionaries self.ffn_config = { @@ -176,6 +178,7 @@ def get_config(self): initializer_range=self.initializer_range, output_router_logits=self.output_router_logits, is_decoder=self.is_decoder, + pad_token_id=self.pad_token_id, ) return config From f57e672aa677107e7c7cd07c628b0babbc938c08 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 2 Apr 2024 20:29:53 +0000 Subject: [PATCH 078/131] remove pad_token_id from config --- src/transformers/models/dbrx/configuration_dbrx.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index e2e8ce1ef095..fa3430ef74b4 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -193,8 +193,6 @@ class DbrxConfig(PretrainedConfig): allow the model to 
output the auxiliary loss. See [here]() for more details. is_decoder (`bool`, defaults to `True`, *optional*, defaults to `True`): Whether the model is used as decoder or not (in which case it’s used as an encoder). - pad_token_id (`int`, *optional*): - Padding token id. Example: @@ -235,7 +233,6 @@ def __init__( initializer_range: float = 0.02, output_router_logits: bool = False, is_decoder: bool = True, - pad_token_id: Optional[int] = None, **kwargs: Any, ): if attn_config is None: @@ -270,6 +267,5 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, is_decoder=is_decoder, - pad_token_id=pad_token_id, **kwargs, ) From 6b6655d3c0ff32936d32ef317692d0e80431b848 Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 3 Apr 2024 13:06:06 +0100 Subject: [PATCH 079/131] Remove all references to DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP --- src/transformers/__init__.py | 6 ++---- src/transformers/models/dbrx/__init__.py | 4 ++-- src/transformers/models/dbrx/configuration_dbrx.py | 3 --- src/transformers/models/deprecated/_archive_maps.py | 3 --- src/transformers/utils/dummy_pt_objects.py | 3 --- 5 files changed, 4 insertions(+), 15 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a582df36fe80..acc9599d385b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -327,7 +327,7 @@ "Data2VecTextConfig", "Data2VecVisionConfig", ], - "models.dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"], + "models.dbrx": ["DbrxConfig"], "models.deberta": [ "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", @@ -1937,7 +1937,6 @@ ) _import_structure["models.dbrx"].extend( [ - "DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxForCausalLM", "DbrxModel", "DbrxPreTrainedModel", @@ -5227,7 +5226,7 @@ Data2VecTextConfig, Data2VecVisionConfig, ) - from .models.dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig + from .models.dbrx import DbrxConfig from .models.deberta import ( DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, @@ -6753,7 +6752,6 @@ # PyTorch model imports from .models.dbrx import ( - DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxForCausalLM, DbrxModel, DbrxPreTrainedModel, diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 75548996fb55..9b1e325896bb 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -17,7 +17,7 @@ _import_structure = { - "configuration_dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"], + "configuration_dbrx": ["DbrxConfig"], } try: @@ -34,7 +34,7 @@ if TYPE_CHECKING: - from .configuration_dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig + from .configuration_dbrx import DbrxConfig try: if not is_torch_available(): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index fa3430ef74b4..86759f7ca030 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -18,9 +18,6 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..deprecated._archive_maps import ( # noqa: F401, E402 - DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, -) logger = logging.get_logger(__name__) diff --git a/src/transformers/models/deprecated/_archive_maps.py b/src/transformers/models/deprecated/_archive_maps.py index 2e8a7fc67893..f7b0679a3e4f 100644 --- a/src/transformers/models/deprecated/_archive_maps.py +++ 
b/src/transformers/models/deprecated/_archive_maps.py @@ -532,8 +532,6 @@ def __getitem__(self, item): DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = DeprecatedList(["facebook/data2vec-vision-base-ft1k"]) -DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict({}) - DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict( { "microsoft/deberta-base": "https://huggingface.co/microsoft/deberta-base/resolve/main/config.json", @@ -2582,7 +2580,6 @@ def __getitem__(self, item): ("data2vec-audio", "DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("data2vec-text", "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("data2vec-vision", "DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP"), - ("dbrx", "DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("deformable_detr", "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f734a2faff23..68d1dca167f0 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2457,9 +2457,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = None - - class DbrxForCausalLM(metaclass=DummyObject): _backends = ["torch"] From 88f350f792b3713431164ae81ff2fd28161c116f Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:15:42 -0400 Subject: [PATCH 080/131] Update src/transformers/models/dbrx/__init__.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/dbrx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/__init__.py b/src/transformers/models/dbrx/__init__.py index 9b1e325896bb..693a544c4b3d 100644 --- a/src/transformers/models/dbrx/__init__.py +++ b/src/transformers/models/dbrx/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From a6c21ebb4b36f50cf4cb376166f1335e929ba3c0 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:19:08 -0400 Subject: [PATCH 081/131] Update src/transformers/models/dbrx/modeling_dbrx.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/dbrx/modeling_dbrx.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 51e56cc5371a..8418783f218e 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -557,7 +557,6 @@ class DbrxSdpaAttention(DbrxAttention): SDPA API. 
""" - # Ignore copy def forward( self, hidden_states: torch.Tensor, From a91e45f9239fe87fdea540435178e8ec0909b6fe Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:20:14 -0400 Subject: [PATCH 082/131] Update docs/source/en/model_doc/dbrx.md Co-authored-by: Matt --- docs/source/en/model_doc/dbrx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index fefdc8b91b4a..8f1412e6c952 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -32,7 +32,7 @@ We used curriculum learning for pretraining, changing the data mix during traini More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). -This model was contributed by [abhi-db]( Date: Thu, 4 Apr 2024 15:35:40 -0400 Subject: [PATCH 083/131] Update src/transformers/models/dbrx/configuration_dbrx.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/models/dbrx/configuration_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 86759f7ca030..2525d0408b8b 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Databricks Mosaic Research and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From ea571d1d964e879954b651bba21c5333daf95eb2 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 4 Apr 2024 15:35:56 -0400 Subject: [PATCH 084/131] Update docs/source/en/model_doc/dbrx.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/model_doc/dbrx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index 8f1412e6c952..b5a9d8e350fd 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -1,4 +1,4 @@ - -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [Dbrx](../model_doc/dbrx), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [RecurrentGemma](../model_doc/recurrent_gemma), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), 
[CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DBRX](../model_doc/dbrx), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [RecurrentGemma](../model_doc/recurrent_gemma), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2e2d3babdbd8..7c36a8b10880 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -21,14 +21,19 @@ from typing import List, Union from ...configuration_utils import PretrainedConfig -from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...dynamic_module_utils import ( + get_class_from_dynamic_module, + resolve_trust_remote_code, +) from ...utils import CONFIG_NAME, logging logger = logging.get_logger(__name__) -from ..deprecated._archive_maps import CONFIG_ARCHIVE_MAP_MAPPING_NAMES # noqa: F401, E402 +from ..deprecated._archive_maps import ( # noqa: F401, E402 + CONFIG_ARCHIVE_MAP_MAPPING_NAMES, +) CONFIG_MAPPING_NAMES = OrderedDict( @@ -339,7 +344,7 @@ ("data2vec-audio", "Data2VecAudio"), ("data2vec-text", "Data2VecText"), ("data2vec-vision", "Data2VecVision"), - ("dbrx", "Dbrx"), + ("dbrx", "DBRX"), ("deberta", "DeBERTa"), ("deberta-v2", "DeBERTa-v2"), ("decision_transformer", "Decision Transformer"), From 9e268503d133a2394d715dd3c4259701b8e74cec Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:49:54 -0400 Subject: [PATCH 123/131] fix 
__init__.py? --- src/transformers/__init__.py | 78 +++++++++++++++--------------------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8f340c50bb15..b0b0100a8792 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5303,10 +5303,7 @@ TransfoXLTokenizer, ) from .models.deprecated.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig - from .models.depth_anything import ( - DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, - DepthAnythingConfig, - ) + from .models.depth_anything import DEPTH_ANYTHING_PRETRAINED_CONFIG_ARCHIVE_MAP, DepthAnythingConfig from .models.deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig from .models.dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig @@ -5362,11 +5359,7 @@ FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGanConfig, ) - from .models.flaubert import ( - FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - FlaubertConfig, - FlaubertTokenizer, - ) + from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, FlavaConfig, @@ -5432,7 +5425,10 @@ from .models.herbert import HerbertTokenizer from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig - from .models.idefics import IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, IdeficsConfig + from .models.idefics import ( + IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, + IdeficsConfig, + ) from .models.idefics2 import Idefics2Config from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig @@ -5662,11 +5658,7 @@ from .models.pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig from .models.pvt_v2 import PvtV2Config from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig - from .models.qwen2 import ( - QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, - Qwen2Config, - Qwen2Tokenizer, - ) + from .models.qwen2 import QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2Config, Qwen2Tokenizer from .models.qwen2_moe import QWEN2MOE_PRETRAINED_CONFIG_ARCHIVE_MAP, Qwen2MoeConfig from .models.rag import RagConfig, RagRetriever, RagTokenizer from .models.realm import ( @@ -5717,10 +5709,7 @@ SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, SeamlessM4Tv2Config, ) - from .models.segformer import ( - SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, - SegformerConfig, - ) + from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig from .models.seggpt import SEGGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, SegGptConfig from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig @@ -5763,14 +5752,8 @@ SqueezeBertTokenizer, ) from .models.stablelm import STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP, StableLmConfig - from .models.starcoder2 import ( - STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP, - Starcoder2Config, - ) - from .models.superpoint import ( - SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP, - SuperPointConfig, - ) + from .models.starcoder2 import STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP, Starcoder2Config + from .models.superpoint import SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP, SuperPointConfig from .models.swiftformer import ( 
SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SwiftFormerConfig, @@ -5812,12 +5795,12 @@ TvltFeatureExtractor, TvltProcessor, ) - from .models.tvp import TVP_PRETRAINED_CONFIG_ARCHIVE_MAP, TvpConfig, TvpProcessor - from .models.udop import ( - UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP, - UdopConfig, - UdopProcessor, + from .models.tvp import ( + TVP_PRETRAINED_CONFIG_ARCHIVE_MAP, + TvpConfig, + TvpProcessor, ) + from .models.udop import UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP, UdopConfig, UdopProcessor from .models.umt5 import UMT5Config from .models.unispeech import ( UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -5841,7 +5824,10 @@ ViltImageProcessor, ViltProcessor, ) - from .models.vipllava import VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, VipLlavaConfig + from .models.vipllava import ( + VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, + VipLlavaConfig, + ) from .models.vision_encoder_decoder import VisionEncoderDecoderConfig from .models.vision_text_dual_encoder import ( VisionTextDualEncoderConfig, @@ -6056,13 +6042,7 @@ ) # bitsandbytes config - from .utils.quantization_config import ( - AqlmConfig, - AwqConfig, - BitsAndBytesConfig, - GPTQConfig, - QuantoConfig, - ) + from .utils.quantization_config import AqlmConfig, AwqConfig, BitsAndBytesConfig, GPTQConfig, QuantoConfig try: if not is_sentencepiece_available(): @@ -6709,7 +6689,11 @@ CodeGenModel, CodeGenPreTrainedModel, ) - from .models.cohere import CohereForCausalLM, CohereModel, CoherePreTrainedModel + from .models.cohere import ( + CohereForCausalLM, + CohereModel, + CoherePreTrainedModel, + ) from .models.conditional_detr import ( CONDITIONAL_DETR_PRETRAINED_MODEL_ARCHIVE_LIST, ConditionalDetrForObjectDetection, @@ -7081,7 +7065,10 @@ FunnelPreTrainedModel, load_tf_weights_in_funnel, ) - from .models.fuyu import FuyuForCausalLM, FuyuPreTrainedModel + from .models.fuyu import ( + FuyuForCausalLM, + FuyuPreTrainedModel, + ) from .models.gemma import ( GemmaForCausalLM, GemmaForSequenceClassification, @@ -7866,6 +7853,8 @@ SamModel, SamPreTrainedModel, ) + + # PyTorch model imports from .models.seamless_m4t import ( SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST, SeamlessM4TCodeHifiGan, @@ -9001,10 +8990,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_torchaudio_objects import * else: - from .models.musicgen_melody import ( - MusicgenMelodyFeatureExtractor, - MusicgenMelodyProcessor, - ) + from .models.musicgen_melody import MusicgenMelodyFeatureExtractor, MusicgenMelodyProcessor try: if not is_flax_available(): raise OptionalDependencyNotAvailable() From d714986d8aed11fa09dd257297d637943ed69c14 Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:51:54 -0400 Subject: [PATCH 124/131] fix __init__.py --- src/transformers/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b0b0100a8792..f52b76a07e61 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1473,7 +1473,6 @@ "AlignVisionModel", ] ) - _import_structure["models.altclip"].extend( [ "ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4645,7 +4644,9 @@ if not is_torchaudio_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils import dummy_torchaudio_objects + from .utils import ( + dummy_torchaudio_objects, + ) _import_structure["utils.dummy_torchaudio_objects"] = [ name for name in dir(dummy_torchaudio_objects) if not name.startswith("_") From 
cac26a1693f48a88c3bee882b05b67a99bcf6a31 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 18 Apr 2024 11:54:18 +0000 Subject: [PATCH 125/131] fix README --- README.md | 4 ++-- README_de.md | 4 ++-- README_es.md | 4 ++-- README_fr.md | 4 ++-- README_hd.md | 4 ++-- README_ja.md | 4 ++-- README_ko.md | 4 ++-- README_pt-br.md | 4 ++-- README_ru.md | 4 ++-- README_te.md | 4 ++-- README_vi.md | 4 ++-- README_zh-hans.md | 4 ++-- README_zh-hant.md | 4 ++-- 13 files changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 55290b6200f2..901b56347446 100644 --- a/README.md +++ b/README.md @@ -341,8 +341,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_de.md b/README_de.md index 95b4cd01162a..cacd7868bc31 100644 --- a/README_de.md +++ b/README_de.md @@ -337,8 +337,8 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_es.md b/README_es.md index 270ccb9a25a6..ad9122748ba8 100644 --- a/README_es.md +++ b/README_es.md @@ -314,8 +314,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_fr.md b/README_fr.md index d25533e0c57a..e00845d9ee49 100644 --- a/README_fr.md +++ b/README_fr.md @@ -335,8 +335,8 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (de Salesforce) publié dans l'article [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) par Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong et Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (de Microsoft) publié dans l'article [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) par Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (de Facebook) publié dans l'article [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) par Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. 
**[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (de Berkeley/Facebook/Google) publié dans l'article [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) par Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_hd.md b/README_hd.md index a9da3cc259ae..8a48828de3a4 100644 --- a/README_hd.md +++ b/README_hd.md @@ -288,8 +288,8 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा। 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा पोस्ट किया गया। 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (बर्कले/फेसबुक/गूगल से) पेपर के साथ [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) लिली चेन, केविन लू, अरविंद राजेश्वरन, किमिन ली, आदित्य ग्रोवर, माइकल लास्किन, पीटर एबील, अरविंद श्रीनिवास, इगोर मोर्डच द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 0a73ccd406b9..14cd6f688aca 100644 --- a/README_ja.md +++ b/README_ja.md @@ -348,8 +348,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google から) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch から公開された研究論文: [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) diff --git a/README_ko.md b/README_ko.md index 156ec78d2677..a2ee0f7dc527 100644 --- a/README_ko.md +++ b/README_ko.md @@ -263,8 +263,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (Berkeley/Facebook/Google 에서) Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 의 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index e0e27e7b0113..dda1d9dc4fa4 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -346,8 +346,8 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_ru.md b/README_ru.md index 2d638ade7601..020b42b24676 100644 --- a/README_ru.md +++ b/README_ru.md @@ -336,8 +336,8 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. 
**[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_te.md b/README_te.md index 4e6079acaa31..46fac43ad86e 100644 --- a/README_te.md +++ b/README_te.md @@ -338,8 +338,8 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. 
**[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_vi.md b/README_vi.md index 552c64673c93..2e89095c50f8 100644 --- a/README_vi.md +++ b/README_vi.md @@ -337,8 +337,8 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (từ Salesforce) được phát hành với bài báo [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (từ Microsoft) được phát hành với bài báo [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (từ Facebook) được phát hành với bài báo [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (từ Berkeley/Facebook/Google) được phát hành với bài báo [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch. diff --git a/README_zh-hans.md b/README_zh-hans.md index 9f9984584f18..ec0de5d61555 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -287,8 +287,8 @@ conda install conda-forge::transformers 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 -1. **[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. -1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by . + +1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (来自 Berkeley/Facebook/Google) 伴随论文 [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) 由 Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index c43a80c71395..ce13720d92b0 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -299,8 +299,8 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. -1. 
**[Dbrx](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
-1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from ) released with the paper []() by .
+
+1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team.
 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.

From fe12d2a0b268211faa8d652c7e0d516c88b0badd Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 18 Apr 2024 11:58:45 +0000
Subject: [PATCH 126/131] return the aux_loss

---
 src/transformers/models/dbrx/modeling_dbrx.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index ab5da32c7028..99b865c773f8 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1417,6 +1417,8 @@ def forward(
 
         if not return_dict:
             output = (logits,) + outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
             return (loss,) + output if loss is not None else output
 
         return MoeCausalLMOutputWithPast(

From 58c8342aa12b4f2b457fb9cee4d32a62b2685f5b Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 18 Apr 2024 12:02:55 +0000
Subject: [PATCH 127/131] remove extra spaces

---
 src/transformers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index f52b76a07e61..76433ae97e5c 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4645,7 +4645,7 @@
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     from .utils import (
-        dummy_torchaudio_objects,
+        dummy_torchaudio_objects,
     )
 
     _import_structure["utils.dummy_torchaudio_objects"] = [

From d04c87022ea7b7a724bced198e2bf89d55372b74 Mon Sep 17 00:00:00 2001
From: Eitan Turok <150733043+eitanturok@users.noreply.github.com>
Date: Thu, 18 Apr 2024 08:05:22 -0400
Subject: [PATCH 128/131] fix configuration_auto.py

---
 src/transformers/models/auto/configuration_auto.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 7c36a8b10880..10af3fbfc163 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -21,19 +21,14 @@
 from typing import List, Union
 
 from ...configuration_utils import PretrainedConfig
-from ...dynamic_module_utils import (
-    get_class_from_dynamic_module,
-    resolve_trust_remote_code,
-)
+from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...utils import CONFIG_NAME, logging
 
 
 logger = logging.get_logger(__name__)
 
 
-from ..deprecated._archive_maps import (  # noqa: F401, E402
-    CONFIG_ARCHIVE_MAP_MAPPING_NAMES,
-)
+from ..deprecated._archive_maps import CONFIG_ARCHIVE_MAP_MAPPING_NAMES  # noqa: F401, E402
 
 
 CONFIG_MAPPING_NAMES = OrderedDict(

From 22804bf52405bf513b2398387b38f725f493250e Mon Sep 17 00:00:00 2001
From: Eitan Turok <150733043+eitanturok@users.noreply.github.com>
Date: Thu, 18 Apr 2024 08:06:11 -0400
Subject: [PATCH 129/131] fix format in tokenization_auto

---
 src/transformers/models/auto/tokenization_auto.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 413bfe001446..e5f9bd2e1544 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -22,10 +22,7 @@
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 from ...configuration_utils import PretrainedConfig
-from ...dynamic_module_utils import (
-    get_class_from_dynamic_module,
-    resolve_trust_remote_code,
-)
+from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...utils import (

From 95b327f4c415f62023aa30bc83f3e8b16f5e0e9a Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 18 Apr 2024 12:36:03 +0000
Subject: [PATCH 130/131] remove new line

---
 README.md         | 1 -
 README_de.md      | 1 -
 README_es.md      | 1 -
 README_fr.md      | 1 -
 README_hd.md      | 1 -
 README_ja.md      | 1 -
 README_ko.md      | 1 -
 README_pt-br.md   | 1 -
 README_ru.md      | 1 -
 README_te.md      | 1 -
 README_vi.md      | 1 -
 README_zh-hans.md | 1 -
 README_zh-hant.md | 1 -
 13 files changed, 13 deletions(-)

diff --git a/README.md b/README.md
index 901b56347446..87d4ada7f36c 100644
--- a/README.md
+++ b/README.md
@@ -341,7 +341,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
-
 1.
**[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_de.md b/README_de.md index cacd7868bc31..13a3c7d5cfc5 100644 --- a/README_de.md +++ b/README_de.md @@ -337,7 +337,6 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_es.md b/README_es.md index ad9122748ba8..0e149c3cacff 100644 --- a/README_es.md +++ b/README_es.md @@ -314,7 +314,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. 
**[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_fr.md b/README_fr.md index e00845d9ee49..41173fcf1c77 100644 --- a/README_fr.md +++ b/README_fr.md @@ -335,7 +335,6 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (de Salesforce) publié dans l'article [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) par Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong et Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (de Microsoft) publié dans l'article [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) par Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (de Facebook) publié dans l'article [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) par Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (de Microsoft) publié dans l'article [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) par Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 
diff --git a/README_hd.md b/README_hd.md index 8a48828de3a4..f30087486c88 100644 --- a/README_hd.md +++ b/README_hd.md @@ -288,7 +288,6 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (सेल्सफोर्स से) साथ में पेपर [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) नीतीश शिरीष केसकर*, ब्रायन मैककैन*, लव आर. वार्ष्णेय, कैमिंग जिओंग और रिचर्ड द्वारा सोचर द्वारा जारी किया गया। 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft से) साथ में दिया गया पेपर [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) हैपिंग वू, बिन जिओ, नोएल कोडेला, मेंगचेन लियू, जियांग दाई, लू युआन, लेई झांग द्वारा। 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (फेसबुक से) साथ में कागज [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) एलेक्सी बाएव्स्की, वेई-निंग सू, कियानटोंग जू, अरुण बाबू, जियाताओ गु, माइकल औली द्वारा पोस्ट किया गया। - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा। 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft से) साथ में दिया गया पेपर [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) पेंगचेंग हे, ज़ियाओडोंग लियू, जियानफेंग गाओ, वीज़ू चेन द्वारा पोस्ट किया गया। diff --git a/README_ja.md b/README_ja.md index 14cd6f688aca..66e30b68b781 100644 --- a/README_ja.md +++ b/README_ja.md @@ -348,7 +348,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce から) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher から公開された研究論文: [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft から) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang から公開された研究論文: [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook から) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli から公開された研究論文: [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft から) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen から公開された研究論文: [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) diff --git a/README_ko.md b/README_ko.md index a2ee0f7dc527..31b5c41e9fac 100644 --- a/README_ko.md +++ b/README_ko.md @@ -263,7 +263,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (Salesforce 에서) Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 의 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 논문과 함께 발표했습니다. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (Microsoft 에서) Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 의 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 논문과 함께 발표했습니다. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (Facebook 에서) Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 의 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 논문과 함께 발표했습니다. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (Microsoft 에서) Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 의 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index dda1d9dc4fa4..a1314cc10f51 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -346,7 +346,6 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. 
**[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_ru.md b/README_ru.md index 020b42b24676..be7a0a150620 100644 --- a/README_ru.md +++ b/README_ru.md @@ -336,7 +336,6 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_te.md b/README_te.md index 46fac43ad86e..3a58aeb07b33 100644 --- a/README_te.md +++ b/README_te.md @@ -338,7 +338,6 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. 
**[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_vi.md b/README_vi.md index 2e89095c50f8..80032c8d02c8 100644 --- a/README_vi.md +++ b/README_vi.md @@ -337,7 +337,6 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (từ Salesforce) được phát hành với bài báo [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (từ Microsoft) được phát hành với bài báo [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (từ Facebook) được phát hành với bài báo [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (từ Microsoft) được phát hành với bài báo [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. diff --git a/README_zh-hans.md b/README_zh-hans.md index ec0de5d61555..435552507f74 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -287,7 +287,6 @@ conda install conda-forge::transformers 1. 
**[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (来自 Microsoft) 伴随论文 [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) 由 Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang 发布。 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (来自 Facebook) 伴随论文 [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) 由 Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli 发布。 - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index ce13720d92b0..ae745aeaa315 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -299,7 +299,6 @@ conda install conda-forge::transformers 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang. 1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli. - 1. **[DBRX](https://huggingface.co/docs/transformers/main/model_doc/dbrx)** (from Databricks) released with the paper [Introducing DBRX: A New State-of-the-Art Open LLM](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm) by the Mosaic Research Team. 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 1. 
**[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.

From c6cbbda8bb3bd434459e69009dcfa71e1e10f2f6 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 18 Apr 2024 12:36:24 +0000
Subject: [PATCH 131/131] add more usage examples

---
 docs/source/en/model_doc/dbrx.md | 53 ++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md
index 1192c22021f2..33435462b3e0 100644
--- a/docs/source/en/model_doc/dbrx.md
+++ b/docs/source/en/model_doc/dbrx.md
@@ -36,14 +36,63 @@ This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) a
 
 ## Usage Examples
 
-The `generate()` method can be used to generate text using DBRX.
+The `generate()` method can be used to generate text using DBRX. You can generate using the standard attention implementation, flash-attention, and the PyTorch scaled dot product attention. The last two attention implementations give speed ups.
 
 ```python
 from transformers import DbrxForCausalLM, AutoTokenizer
 import torch
 
 tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN")
-model = DbrxForCausalLM.from_pretrained("databricks/dbrx-instruct", device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True, token="YOUR_HF_TOKEN")
+model = DbrxForCausalLM.from_pretrained(
+    "databricks/dbrx-instruct",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    token="YOUR_HF_TOKEN",
+    )
+
+input_text = "What does it take to build a great LLM?"
+messages = [{"role": "user", "content": input_text}]
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids, max_new_tokens=200)
+print(tokenizer.decode(outputs[0]))
+```
+
+If you have flash-attention installed (`pip install flash-attn`), it is possible to generate faster. (The HuggingFace documentation for flash-attention can be found [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2).)
+```python
+from transformers import DbrxForCausalLM, AutoTokenizer
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN")
+model = DbrxForCausalLM.from_pretrained(
+    "databricks/dbrx-instruct",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    token="YOUR_HF_TOKEN",
+    attn_implementation="flash_attention_2",
+    )
+
+input_text = "What does it take to build a great LLM?"
+messages = [{"role": "user", "content": input_text}]
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids, max_new_tokens=200)
+print(tokenizer.decode(outputs[0]))
+```
+
+You can also generate faster using the PyTorch scaled dot product attention. (The HuggingFace documentation for scaled dot product attention can be found [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention).)
+```python
+from transformers import DbrxForCausalLM, AutoTokenizer
+import torch
+
+tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOUR_HF_TOKEN")
+model = DbrxForCausalLM.from_pretrained(
+    "databricks/dbrx-instruct",
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    token="YOUR_HF_TOKEN",
+    attn_implementation="sdpa",
+    )
 
 input_text = "What does it take to build a great LLM?"
 messages = [{"role": "user", "content": input_text}]