Merged
2 changes: 1 addition & 1 deletion docs/source/en/_toctree.yml
@@ -1048,7 +1048,7 @@
- local: model_doc/llama4
title: Llama4
- local: model_doc/llava
title: Llava
title: LLaVA
- local: model_doc/llava_next
title: LLaVA-NeXT
- local: model_doc/llava_next_video
19 changes: 8 additions & 11 deletions docs/source/en/model_doc/llava_onevision.md
@@ -38,7 +38,7 @@ yielding new emerging capabilities. In particular, strong video understanding and
cross-scenario capabilities are demonstrated through task transfer from images to
videos.*

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava-ov-acrhitecture.png"
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava-ov-architecture.png"
alt="drawing" width="600"/>

<small> LLaVA-OneVision architecture. Taken from the <a href="https://huggingface.co/papers/2408.03326">original paper.</a> </small>
@@ -165,20 +165,20 @@ conversation_1 = [
"content": [
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "text", "text": "What is shown in this image?"},
],
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There is a red stop sign in the image."},
],
],
},
{
"role": "user",
"content": [
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
{"type": "text", "text": "What about this image? How many cats do you see?"},
],
],
},
]

@@ -188,7 +188,7 @@ conversation_2 = [
"content": [
{"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
{"type": "text", "text": "What is shown in this image?"},
],
],
},
]

@@ -198,13 +198,14 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
padding=True,
return_tensors="pt"
padding_side="left",
return_tensors="pt",
).to(model.device, torch.float16)

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
['user\n\nWhat is shown in this image?\nassistant\nThere is a red stop sign in the image.\nuser\n\nWhat about this image? How many cats do you see?\nassistant\ntwo', 'user\n\nWhat is shown in this image?\nassistant\n']
['user\n\nWhat is shown in this image?\nassistant\nThere is a red stop sign in the image.\nuser\n\nWhat about this image? How many cats do you see?\nassistant\ntwo', 'user\n\nWhat is shown in this image?\nassistant\nThe image shows a whimsical scene of a snowman sitting by a campfire. The snowman is anthropomorphized, wearing a hat and']
```

### Video inference
@@ -312,10 +313,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(

[[autodoc]] LlavaOnevisionVideoProcessor

## LlavaOnevisionVideoProcessor

[[autodoc]] LlavaOnevisionVideoProcessor

## LlavaOnevisionModel

[[autodoc]] LlavaOnevisionModel
@@ -682,6 +682,7 @@ def preprocess(

if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
# if the first element is a list, we assume that all elements are lists
images = [x for x in images if x] # handle text-only case
batch_num_images = [len(x) for x in images]
elif isinstance(images, (tuple, list)):
# treat this as a single-image case for backward compatibility
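The line added in this hunk lets the nested-images branch of `preprocess` cope with batches that mix text-only and image-bearing conversations: a text-only sample contributes an empty list, which is dropped before the per-sample image counts are computed. A rough standalone sketch of that behaviour (the helper name and placeholder strings below are made up for illustration; this is not the library code):

```python
def count_images_per_sample(images):
    """Hypothetical helper mirroring the nested-list branch of preprocess."""
    if isinstance(images, (tuple, list)) and images and isinstance(images[0], (tuple, list)):
        images = [x for x in images if x]  # drop [] entries coming from text-only samples
        return [len(x) for x in images]
    raise TypeError("expected a nested list of images")

# A batch mixing a text-only prompt with a two-image and a one-image prompt:
nested = [[], ["img_a", "img_b"], ["img_c"]]
print(count_images_per_sample(nested))  # [2, 1]
```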
@@ -394,7 +394,6 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None, v
image_feature = image_feature[0]
if image_newline is not None:
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
image_feature = image_feature.flatten(0, 1)
new_image_features.append(image_feature)
feature_lens.append(image_feature.size(0))
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
@@ -320,7 +320,6 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None, v
image_feature = image_feature[0]
if image_newline is not None:
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
image_feature = image_feature.flatten(0, 1)
new_image_features.append(image_feature)
feature_lens.append(image_feature.size(0))
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
@@ -215,14 +215,15 @@ def _expand_image_tokens(
max_num_vision_tokens = 0
for sample in text:
if special_token in sample:
is_multi_image = next(batch_num_images) != 1
num_images = next(batch_num_images) # should consume iterable
is_multi_image = num_images != 1
else:
is_multi_image = False
while special_token in sample:
original_size = next(image_sizes) # should consume iterable
if is_multi_image:
num_image_tokens = self.num_image_tokens + 1 # one for image_newline
else:
original_size = next(image_sizes)
if not isinstance(original_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
original_size = original_size.tolist()
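The processing change above is essentially about iterator discipline: the per-sample image count is read once into `num_images`, and `next(image_sizes)` is moved out of the single-image branch so the sizes iterator advances for every `<image>` placeholder, multi-image samples included. A standalone sketch of that pattern (prompts and sizes are invented; this is not the library's `_expand_image_tokens`):

```python
texts = ["<image><image> compare these", "tell me a story", "<image> describe this"]
batch_num_images = iter([2, 1])                            # one count per image-bearing sample
image_sizes = iter([(336, 336), (672, 336), (224, 224)])   # one size per image in the batch

for sample in texts:
    if "<image>" in sample:
        num_images = next(batch_num_images)  # consumed once per sample with images
        is_multi_image = num_images != 1
    else:
        is_multi_image = False
    while "<image>" in sample:
        original_size = next(image_sizes)    # advanced for every placeholder,
                                             # on the multi- and single-image paths alike
        tag = "multi" if is_multi_image else f"single {original_size}"
        sample = sample.replace("<image>", f"[{tag}]", 1)
    print(sample)
```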
25 changes: 15 additions & 10 deletions tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -446,20 +446,25 @@ def test_small_model_integration_test_multi_image_nested(self):

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = (
"user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n"
)
images_nested = [[self.image, image]]
inputs = self.processor(text=prompt, images=images_nested, return_tensors="pt").to(torch_device, torch.float16)
prompts = [
"user\nTell me about the french revolution.<|im_end|>\n<|im_start|>assistant\n", # text-only case
"user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n",
self.prompt_image,
]
images_nested = [[], [image, self.image], [self.image]]
inputs = self.processor(
text=prompts,
images=images_nested,
return_tensors="pt",
padding=True,
).to(torch_device, torch.float16)

# verify generation
output = model.generate(**inputs, max_new_tokens=40)
EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe first image is a radar chart showing the performance of different models in a specific task, while the second image is a street scene with a stop sign in the foreground." # fmt: skip
EXPECTED_DECODED_TEXT = ["user\nTell me about the french revolution.\nassistant\nThe French Revolution! A pivotal event in modern history that had a profound impact on the course of Western civilization. Here's a brief overview:\n\n**Background**\n\nIn the late 18th century,", "user\n\nWhat is the difference between these images?\nassistant\nThe first image shows a stop sign with a traditional Chinese architectural background, while the second image displays a radar chart with various algorithms and models, including BLIP-2, InstructBLIP, Q", "user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different"] # fmt: skip
DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)

self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
self.assertListEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)

@slow
@require_bitsandbytes