From 201479e89618bae34f9b7764cfea72084161acb3 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Thu, 25 Jul 2024 11:03:56 +0000 Subject: [PATCH 1/3] Fix llava, llava_next, video_llava, vipllava --- src/transformers/models/llava/modeling_llava.py | 4 +++- src/transformers/models/llava_next/modeling_llava_next.py | 4 +++- src/transformers/models/video_llava/modeling_video_llava.py | 4 ++-- src/transformers/models/vipllava/modeling_vipllava.py | 4 +++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 0426776beed1..4c346002f861 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -236,7 +236,9 @@ def _supports_sdpa(self): class LlavaForConditionalGeneration(LlavaPreTrainedModel): def __init__(self, config: LlavaConfig): super().__init__(config) - self.vision_tower = AutoModel.from_config(config.vision_config) + self.vision_tower = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) self.multi_modal_projector = LlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index ad76561df54f..5db51fe3c6ac 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -345,7 +345,9 @@ def _supports_sdpa(self): class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel): def __init__(self, config: LlavaNextConfig): super().__init__(config) - self.vision_tower = AutoModel.from_config(config.vision_config) + self.vision_tower = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) self.multi_modal_projector = LlavaNextMultiModalProjector(config) embed_std = 1 / math.sqrt(config.text_config.hidden_size) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index cb54c433fde8..d948479b5bb4 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -237,8 +237,8 @@ def _supports_sdpa(self): class VideoLlavaForConditionalGeneration(VideoLlavaPreTrainedModel): def __init__(self, config: VideoLlavaConfig): super().__init__(config) - self.video_tower = AutoModel.from_config(config.vision_config) - self.image_tower = AutoModel.from_config(config.vision_config) + self.video_tower = AutoModel.from_config(config.vision_config, attn_implementation=config._attn_implementation) + self.image_tower = AutoModel.from_config(config.vision_config, attn_implementation=config._attn_implementation) self.multi_modal_projector = VideoLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index c5f856e78745..0a584912a08c 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -241,7 +241,9 @@ def _supports_sdpa(self): class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel): def __init__(self, config: VipLlavaConfig): super().__init__(config) - self.vision_tower = AutoModel.from_config(config.vision_config) + self.vision_tower = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) self.multi_modal_projector = VipLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size From 1dc199276a00027e548c0be9f96a629e0ab20779 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Thu, 25 Jul 2024 11:16:54 +0000 Subject: [PATCH 2/3] Fix llava_next_video --- .../models/llava_next_video/modeling_llava_next_video.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index e3264dfd91e1..b972420af864 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -388,7 +388,9 @@ def __init__( config: LlavaNextVideoConfig, ): super().__init__(config) - self.vision_tower = AutoModel.from_config(config.vision_config) + self.vision_tower = AutoModel.from_config( + config.vision_config, attn_implementation=config._attn_implementation + ) self.multi_modal_projector = LlavaNextVideoMultiModalProjector(config) embed_std = 1 / math.sqrt(config.text_config.hidden_size) From 64045251b46d48db1f6cfd28593aefe3f9e52d38 Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Thu, 25 Jul 2024 11:27:56 +0000 Subject: [PATCH 3/3] [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video