hpcaitech · ver217 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/colossalai/shardformer/modeling/mistral.py b/colossalai/shardformer/modeling/mistral.py
@@ -4,7 +4,10 @@
 import torch
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -77,7 +80,7 @@ def mistral_model_forward(
         else:
             position_ids = position_ids.view(-1, seq_length).long()
 
-        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
             if is_padding_right:
                 raise ValueError(
@@ -97,9 +100,18 @@ def mistral_model_forward(
                 is_causal=True,
             )
         else:
-            if self._use_flash_attention_2:
+            if self._attn_implementation == "flash_attention_2":
                 # 2d mask is passed through the layers
                 attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            elif self._attn_implementation == "sdpa" and not output_attentions:
+                # output_attentions=True can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
             else:
                 # 4d mask is passed through the layers
                 attention_mask = _prepare_4d_causal_attention_mask(
@@ -462,7 +474,7 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
             if is_padding_right:
                 raise ValueError(
@@ -481,9 +493,18 @@ def forward(
                 is_causal=True,
             )
         else:
-            if self._use_flash_attention_2:
+            if self._attn_implementation == "flash_attention_2":
                 # 2d mask is passed through the layers
                 attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            elif self._attn_implementation == "sdpa" and not output_attentions:
+                # output_attentions=True can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
             else:
                 # 4d mask is passed through the layers
                 attention_mask = _prepare_4d_causal_attention_mask(

diff --git a/colossalai/shardformer/policies/mistral.py b/colossalai/shardformer/policies/mistral.py
@@ -42,11 +42,13 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             MistralDecoderLayer,
             MistralFlashAttention2,
             MistralModel,
+            MistralSdpaAttention,
         )
 
         ATTN_IMPLEMENTATION = {
             "eager": MistralAttention,
             "flash_attention_2": MistralFlashAttention2,
+            "sdpa": MistralSdpaAttention,
         }
 
         policy = {}

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -16,7 +16,7 @@ ray
 sentencepiece
 google
 protobuf
-transformers>=4.36.2,<4.40.0
+transformers==4.39.3
 peft>=0.7.1
 bitsandbytes>=0.39.0
 rpyc==6.0.0