From 04ea11d8880b62cea38d4eafaf47b49608287719 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 15 Aug 2024 08:12:35 +0200 Subject: [PATCH 01/27] working version --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/llava-onevision.md | 300 ++++++++ src/transformers/__init__.py | 19 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/llava_onevision/__init__.py | 72 ++ .../configuration_llava_onevision.py | 194 +++++ .../convert_llava_onevision_weights_to_hf.py | 360 +++++++++ .../image_processing_llava_onevision.py | 711 ++++++++++++++++++ .../modeling_llava_onevision.py | 666 ++++++++++++++++ .../processing_llava_onevision.py | 264 +++++++ .../video_processing_llava_onevision.py | 711 ++++++++++++++++++ tests/models/llava_onevision/__init__.py | 0 .../test_modeling_llava_onevision.py | 553 ++++++++++++++ 17 files changed, 3859 insertions(+) create mode 100644 docs/source/en/model_doc/llava-onevision.md create mode 100644 src/transformers/models/llava_onevision/__init__.py create mode 100644 src/transformers/models/llava_onevision/configuration_llava_onevision.py create mode 100644 src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py create mode 100644 src/transformers/models/llava_onevision/image_processing_llava_onevision.py create mode 100644 src/transformers/models/llava_onevision/modeling_llava_onevision.py create mode 100644 src/transformers/models/llava_onevision/processing_llava_onevision.py create mode 100644 src/transformers/models/llava_onevision/video_processing_llava_onevision.py create mode 100644 tests/models/llava_onevision/__init__.py create mode 100644 tests/models/llava_onevision/test_modeling_llava_onevision.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f9a484269afc..d38e77aed5cf 100644 --- 
a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -820,6 +820,8 @@ title: LLaVA-NeXT - local: model_doc/llava-next-video title: LLaVa-NeXT-Video + - local: model_doc/llava-onevision + title: LLaVA-Onevision - local: model_doc/lxmert title: LXMERT - local: model_doc/matcha diff --git a/docs/source/en/model_doc/llava-onevision.md b/docs/source/en/model_doc/llava-onevision.md new file mode 100644 index 000000000000..b132b4570c82 --- /dev/null +++ b/docs/source/en/model_doc/llava-onevision.md @@ -0,0 +1,300 @@ + + +# LLaVA-Onevision + +## Overview + +The LLaVA-Onevision model was proposed in [LLaVA-OneVision: Easy Visual Task Transfer](https://arxiv.org/abs/2408.03326) by + +- LLaVA-Onevision uses a different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". + + + +- Note that the model should use a specific prompt format, on which the large language model (LLM) was trained. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history; passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. + +We will use [llava-onevision-qwen2-7b-si-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-si-hf) and a conversation history of text and image. 
Each content field has to be a list of dicts, as follows: + +```python +from transformers import AutoProcessor + +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-si-hf") + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What’s shown in this image?"}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "This image shows a red stop sign."},] + }, + { + + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image in more details."}, + ], + }, +] + +text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + +# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images +print(text_prompt) +>>> "<|im_start|>user\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>" +``` + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main). 
+ + +## Usage example + +### Single image inference + +Here's how to load the model and perform inference in half-precision (`torch.float16`): + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-si-hf") + +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-si-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True) +model.to("cuda:0") + +# prepare image and text prompt, using the appropriate prompt template +url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" +image = Image.open(requests.get(url, stream=True).raw) + +conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, +] +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +inputs = processor(prompt, image, return_tensors="pt").to("cuda:0") + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=100) + +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +### Multi image inference + +LLaVa-Onevision can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). For that you have to use checkpoints with an "ov" suffix. 
Here is how you can do it: + +```python +import requests +from PIL import Image +import torch +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +# Load the model in half-precision +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto") +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") + +# Get three different images +url = "https://www.ilankelman.org/stopsigns/australia.jpg" +image_stop = Image.open(requests.get(url, stream=True).raw) + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_cats = Image.open(requests.get(url, stream=True).raw) + +url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" +image_snowman = Image.open(requests.get(url, stream=True).raw) + +# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not +conversation_1 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "There is a red stop sign in the image."}, + ], + }, + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What about this image? 
How many cats do you see?"}, + ], + }, +] + +conversation_2 = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, +] + +prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True) +prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True) +prompts = [prompt_1, prompt_2] + +# We can simply feed images in the order they have to be used in the text prompt +inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device) + +# Generate +generate_ids = model.generate(**inputs, max_new_tokens=30) +processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) +``` + +### Video inference + +LLaVa-Onevision can also perform inference with videos as input, where the sampled video frames are treated as multiple images. Here is how you can do it: + +```python +import av +import numpy as np +from huggingface_hub import hf_hub_download + +import torch +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +# Load the model in half-precision +model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, device_map="auto") +processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf") + + +def read_video_pyav(container, indices): + ''' + Decode the video with PyAV decoder. + Args: + container (`av.container.input.InputContainer`): PyAV container. + indices (`List[int]`): List of frame indices to decode. + Returns: + result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). 
+ ''' + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + +# Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos, up to 32 frames) +video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset") +container = av.open(video_path) +total_frames = container.streams.video[0].frames +indices = np.arange(0, total_frames, total_frames / 8).astype(int) +video = read_video_pyav(container, indices) + +# For videos we have to feed a "video" type instead of "image" +conversation = [ + { + + "role": "user", + "content": [ + {"type": "video"}, + {"type": "text", "text": "Why is this video funny?"}, + ], + }, +] + +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) +inputs = processor(text=prompt, images=list(video), return_tensors="pt") + +out = model.generate(**inputs, max_new_tokens=60) +processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True) + +``` + +## Model optimization + +### Quantization using Bitsandbytes + +The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. 
Simply change the snippet above with: + +```python +from transformers import LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig + +# specify how to quantize the model +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, +) + +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto") +``` + +### Use Flash-Attention 2 to further speed-up generation + +First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with: + +```python +from transformers import LlavaOnevisionForConditionalGeneration + +model = LlavaOnevisionForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + use_flash_attention_2=True +).to(0) +``` + + +## LlavaOnevisionConfig + +[[autodoc]] LlavaOnevisionConfig + +## LlavaOnevisionForConditionalGeneration + +[[autodoc]] LlavaOnevisionForConditionalGeneration + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b291ee828933..a5210e8fa27d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -529,6 +529,7 @@ "LlavaNextVideoConfig", "LlavaNextVideoProcessor", ], + "models.llava_onevision": ["LlavaOnevisionConfig", "LlavaOnevisionProcessor"], "models.longformer": [ "LongformerConfig", "LongformerTokenizer", @@ -1172,6 +1173,9 @@ _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"]) _import_structure["models.llava_next"].append("LlavaNextImageProcessor") _import_structure["models.llava_next_video"].append("LlavaNextVideoImageProcessor") + _import_structure["models.llava_onevision"].extend( + ["LlavaOnevisionImageProcessor", "LlavaOnevisionVideoProcessor"] + ) 
_import_structure["models.mask2former"].append("Mask2FormerImageProcessor") _import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"]) _import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"]) @@ -2497,6 +2501,12 @@ "LlavaNextVideoPreTrainedModel", ] ) + _import_structure["models.llava_onevision"].extend( + [ + "LlavaOnevisionForConditionalGeneration", + "LlavaOnevisionPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "LongformerForMaskedLM", @@ -5245,6 +5255,10 @@ LlavaNextVideoConfig, LlavaNextVideoProcessor, ) + from .models.llava_onevision import ( + LlavaOnevisionConfig, + LlavaOnevisionProcessor, + ) from .models.longformer import ( LongformerConfig, LongformerTokenizer, @@ -5923,6 +5937,7 @@ from .models.levit import LevitFeatureExtractor, LevitImageProcessor from .models.llava_next import LlavaNextImageProcessor from .models.llava_next_video import LlavaNextVideoImageProcessor + from .models.llava_onevision import LlavaOnevisionImageProcessor, LlavaOnevisionVideoProcessor from .models.mask2former import Mask2FormerImageProcessor from .models.maskformer import ( MaskFormerFeatureExtractor, @@ -7025,6 +7040,10 @@ LlavaNextVideoForConditionalGeneration, LlavaNextVideoPreTrainedModel, ) + from .models.llava_onevision import ( + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionPreTrainedModel, + ) from .models.longformer import ( LongformerForMaskedLM, LongformerForMultipleChoice, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 8e917af7c681..8f5f25907675 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -129,6 +129,7 @@ llava, llava_next, llava_next_video, + llava_onevision, longformer, longt5, luke, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 
d06c99e18f36..f0781d9bce8a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -145,6 +145,7 @@ ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), ("llava-next-video", "LlavaNextVideoConfig"), + ("llava-onevision", "LlavaOnevisionConfig"), ("llava_next", "LlavaNextConfig"), ("longformer", "LongformerConfig"), ("longt5", "LongT5Config"), @@ -435,6 +436,7 @@ ("llama3", "Llama3"), ("llava", "LLaVa"), ("llava-next-video", "LLaVa-NeXT-Video"), + ("llava-onevision", "LLaVA-Onevision"), ("llava_next", "LLaVA-NeXT"), ("longformer", "Longformer"), ("longt5", "LongT5"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 0cf0752e1060..2f785061cb1d 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -307,6 +307,7 @@ ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), + ("llava-onevision", "LlavaOnevisionForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), @@ -718,6 +719,7 @@ ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), + ("llava-onevision", "LlavaOnevisionForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7877343d5318..4722a5b163f5 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -72,6 +72,7 @@ ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), 
("llava-next-video", "LlavaNextVideoProcessor"), + ("llava-onevision", "LlavaOnevisionProcessor"), ("llava_next", "LlavaNextProcessor"), ("markuplm", "MarkupLMProcessor"), ("mctct", "MCTCTProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 5df108a0faf3..b250ed234c82 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -257,6 +257,7 @@ ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava-onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( diff --git a/src/transformers/models/llava_onevision/__init__.py b/src/transformers/models/llava_onevision/__init__.py new file mode 100644 index 000000000000..f16948a8f740 --- /dev/null +++ b/src/transformers/models/llava_onevision/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_llava_onevision": ["LlavaOnevisionConfig"], + "processing_llava_onevision": ["LlavaOnevisionProcessor"], +} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_llava_onevision"] = ["LlavaOnevisionImageProcessor"] + + _import_structure["video_processing_llava_onevision"] = ["LlavaOnevisionVideoProcessor"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_llava_onevision"] = [ + "LlavaOnevisionForConditionalGeneration", + "LlavaOnevisionPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_llava_onevision import LlavaOnevisionConfig + from .processing_llava_onevision import LlavaOnevisionProcessor + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_llava_onevision import LlavaOnevisionImageProcessor + from .video_processing_llava_onevision import LlavaOnevisionVideoProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_llava_onevision import ( + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py new file mode 100644 index 000000000000..707123c8da31 --- /dev/null +++ 
b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -0,0 +1,194 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from . +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the diff. If any change should be done, please apply the change to the +# diff.py file directly. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...configuration_utils import PretrainedConfig +from ...utils import ( + logging, +) +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class LlavaOnevisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlavaOnevisionForConditionalGeneration`]. It is used to instantiate an + Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf) + model. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `SiglipVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`): + The config object or dictionary of the text backbone. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + image_token_index (`int`, *optional*, defaults to 151646): + The image token index to encode the image prompt. + video_token_index (`int`, *optional*, defaults to 151647): + The video token index to encode the video prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`): + Aspect ratio used when processing image features. The default value is "anyres_max_9". + image_grid_pinpoints (`List`, *optional*): + A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list + of the form `(height, width)`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. 
+ + Example: + + ```python + >>> from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionConfig, SiglipVisionConfig, Qwen2Config + + >>> # Initializing a SigLIP-vision config + >>> vision_config = SiglipVisionConfig() + + >>> # Initializing a Qwen2 config + >>> text_config = Qwen2Config() + + >>> # Initializing a LLaVA-Onevision llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration + >>> configuration = LlavaOnevisionConfig(vision_config, text_config) + + >>> # Initializing a model from the llava-hf/llava-onevision-qwen2-7b-ov-hf style configuration + >>> model = LlavaOnevisionForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llava_onevision" + is_composition = False + + def __init__( + self, + vision_config=None, + text_config=None, + ignore_index=-100, + image_token_index=151646, + video_token_index=151647, + projector_hidden_act="gelu", + vision_feature_select_strategy="full", + vision_feature_layer=-1, + vision_aspect_ratio="anyres_max_9", + image_grid_pinpoints=None, + tie_word_embeddings=False, + **kwargs, + ): + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.video_token_index = video_token_index + self.projector_hidden_act = projector_hidden_act + + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." 
+ f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.vision_aspect_ratio = vision_aspect_ratio + image_grid_pinpoints = ( + image_grid_pinpoints + if image_grid_pinpoints is not None + else [ + [384, 384], + [384, 768], + [384, 1152], + [384, 1536], + [384, 1920], + [384, 2304], + [768, 384], + [768, 768], + [768, 1152], + [768, 1536], + [768, 1920], + [768, 2304], + [1152, 384], + [1152, 768], + [1152, 1152], + [1152, 1536], + [1152, 1920], + [1152, 2304], + [1536, 384], + [1536, 768], + [1536, 1152], + [1536, 1536], + [1536, 1920], + [1536, 2304], + [1920, 384], + [1920, 768], + [1920, 1152], + [1920, 1536], + [1920, 1920], + [1920, 2304], + [2304, 384], + [2304, 768], + [2304, 1152], + [2304, 1536], + [2304, 1920], + [2304, 2304], + ] + ) + self.image_grid_pinpoints = image_grid_pinpoints + + if isinstance(vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model" + ) + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + intermediate_size=4096, + hidden_size=1024, + patch_size=14, + image_size=336, + num_hidden_layers=24, + num_attention_heads=16, + vocab_size=32000, + projection_dim=768, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]() + + self.text_config = text_config + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py 
b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py new file mode 100644 index 000000000000..e505ce94e213 --- /dev/null +++ b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py @@ -0,0 +1,360 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert LLaVa-Onevision checkpoints from the original repository. + +URL: https://github.com/LLaVA-VL/LLaVA-NeXT/tree/main + +""" + +import argparse +import gc +import glob +import json +from pathlib import Path + +import requests +import torch +from accelerate import init_empty_weights +from huggingface_hub import hf_hub_download, snapshot_download +from PIL import Image +from safetensors import safe_open + +from transformers import ( + AddedToken, + AutoConfig, + AutoTokenizer, + LlavaOnevisionConfig, + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionImageProcessor, + LlavaOnevisionProcessor, + LlavaOnevisionVideoProcessor, + SiglipVisionConfig, +) + + +KEYS_TO_MODIFY_MAPPING = { + "model.vision_tower.": "", + "model.mm_projector": "multi_modal_projector", + "model": "model.model", + "vision_model.model": "vision_model", + "lm_head": "language_model.lm_head", + "model.model": "language_model.model", + "multi_modal_projector.0": "multi_modal_projector.linear_1", + "multi_modal_projector.2": "multi_modal_projector.linear_2", + "language_model.model.image_newline": "image_newline", +} + 
+chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '\n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '