Merged
2 changes: 1 addition & 1 deletion docs/source/en/_toctree.yml
@@ -1048,7 +1048,7 @@
- local: model_doc/llama4
title: Llama4
- local: model_doc/llava
title: Llava
title: LLaVA
- local: model_doc/llava_next
title: LLaVA-NeXT
- local: model_doc/llava_next_video
19 changes: 8 additions & 11 deletions docs/source/en/model_doc/llava_onevision.md
@@ -38,7 +38,7 @@ yielding new emerging capabilities. In particular, strong video understanding and
cross-scenario capabilities are demonstrated through task transfer from images to
videos.*

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava-ov-acrhitecture.png"
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava-ov-architecture.png"
alt="drawing" width="600"/>

<small> LLaVA-OneVision architecture. Taken from the <a href="https://huggingface.co/papers/2408.03326">original paper.</a> </small>
@@ -165,20 +165,20 @@ conversation_1 = [
"content": [
{"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
{"type": "text", "text": "What is shown in this image?"},
],
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There is a red stop sign in the image."},
],
],
},
{
"role": "user",
"content": [
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
{"type": "text", "text": "What about this image? How many cats do you see?"},
],
],
},
]

@@ -188,7 +188,7 @@ conversation_2 = [
"content": [
{"type": "image", "url": "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"},
{"type": "text", "text": "What is shown in this image?"},
],
],
},
]

@@ -198,13 +198,14 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
padding=True,
return_tensors="pt"
padding_side="left",
return_tensors="pt",
).to(model.device, torch.float16)

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
['user\n\nWhat is shown in this image?\nassistant\nThere is a red stop sign in the image.\nuser\n\nWhat about this image? How many cats do you see?\nassistant\ntwo', 'user\n\nWhat is shown in this image?\nassistant\n']
['user\n\nWhat is shown in this image?\nassistant\nThere is a red stop sign in the image.\nuser\n\nWhat about this image? How many cats do you see?\nassistant\ntwo', 'user\n\nWhat is shown in this image?\nassistant\nThe image shows a whimsical scene of a snowman sitting by a campfire. The snowman is anthropomorphized, wearing a hat and']
```

### Video inference
@@ -312,10 +313,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(

[[autodoc]] LlavaOnevisionVideoProcessor

## LlavaOnevisionVideoProcessor

[[autodoc]] LlavaOnevisionVideoProcessor

## LlavaOnevisionModel

[[autodoc]] LlavaOnevisionModel
@@ -682,6 +682,7 @@ def preprocess(

if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)):
# if the first element is a list, we assume that all elements are lists
images = [x for x in images if x] # handle text-only case
batch_num_images = [len(x) for x in images]
elif isinstance(images, (tuple, list)):
# treat this as a single-image case for backward compatibility
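The line added in this hunk lets the nested-images branch of `preprocess` cope with batches that mix text-only and image-bearing conversations: a text-only sample contributes an empty list, which is dropped before the per-sample image counts are computed. A rough standalone sketch of that behaviour (the helper name and placeholder strings below are made up for illustration; this is not the library code):

```python
def count_images_per_sample(images):
    """Hypothetical helper mirroring the nested-list branch of preprocess."""
    if isinstance(images, (tuple, list)) and images and isinstance(images[0], (tuple, list)):
        images = [x for x in images if x]  # drop [] entries coming from text-only samples
        return [len(x) for x in images]
    raise TypeError("expected a nested list of images")

# A batch mixing a text-only prompt with a two-image and a one-image prompt:
nested = [[], ["img_a", "img_b"], ["img_c"]]
print(count_images_per_sample(nested))  # [2, 1]
```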
@@ -394,7 +394,6 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None, v
image_feature = image_feature[0]
if image_newline is not None:
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
image_feature = image_feature.flatten(0, 1)
new_image_features.append(image_feature)
feature_lens.append(image_feature.size(0))
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
@@ -320,7 +320,6 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None, v
image_feature = image_feature[0]
if image_newline is not None:
image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
image_feature = image_feature.flatten(0, 1)
new_image_features.append(image_feature)
feature_lens.append(image_feature.size(0))
feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
@@ -215,14 +215,15 @@ def _expand_image_tokens(
max_num_vision_tokens = 0
for sample in text:
if special_token in sample:
is_multi_image = next(batch_num_images) != 1
num_images = next(batch_num_images) # should consume iterable
is_multi_image = num_images != 1
else:
is_multi_image = False
while special_token in sample:
original_size = next(image_sizes) # should consume iterable
if is_multi_image:
num_image_tokens = self.num_image_tokens + 1 # one for image_newline
else:
original_size = next(image_sizes)
if not isinstance(original_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
original_size = original_size.tolist()
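The processing change above is essentially about iterator discipline: the per-sample image count is read once into `num_images`, and `next(image_sizes)` is moved out of the single-image branch so the sizes iterator advances for every `<image>` placeholder, multi-image samples included. A standalone sketch of that pattern (prompts and sizes are invented; this is not the library's `_expand_image_tokens`):

```python
texts = ["<image><image> compare these", "tell me a story", "<image> describe this"]
batch_num_images = iter([2, 1])                            # one count per image-bearing sample
image_sizes = iter([(336, 336), (672, 336), (224, 224)])   # one size per image in the batch

for sample in texts:
    if "<image>" in sample:
        num_images = next(batch_num_images)  # consumed once per sample with images
        is_multi_image = num_images != 1
    else:
        is_multi_image = False
    while "<image>" in sample:
        original_size = next(image_sizes)    # advanced for every placeholder,
                                             # on the multi- and single-image paths alike
        tag = "multi" if is_multi_image else f"single {original_size}"
        sample = sample.replace("<image>", f"[{tag}]", 1)
    print(sample)
```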
25 changes: 15 additions & 10 deletions tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -446,20 +446,25 @@ def test_small_model_integration_test_multi_image_nested(self):

url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = (
"user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n"
)
images_nested = [[self.image, image]]
inputs = self.processor(text=prompt, images=images_nested, return_tensors="pt").to(torch_device, torch.float16)
prompts = [
"user\nTell me about the french revolution.<|im_end|>\n<|im_start|>assistant\n", # text-only case
"user\n<image><image>\nWhat is the difference between these images?<|im_end|>\n<|im_start|>assistant\n",
self.prompt_image,
]
images_nested = [[], [image, self.image], [self.image]]
inputs = self.processor(
text=prompts,
images=images_nested,
return_tensors="pt",
padding=True,
).to(torch_device, torch.float16)

# verify generation
output = model.generate(**inputs, max_new_tokens=40)
EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe first image is a radar chart showing the performance of different models in a specific task, while the second image is a street scene with a stop sign in the foreground." # fmt: skip
EXPECTED_DECODED_TEXT = ["user\nTell me about the french revolution.\nassistant\nThe French Revolution! A pivotal event in modern history that had a profound impact on the course of Western civilization. Here's a brief overview:\n\n**Background**\n\nIn the late 18th century,", "user\n\nWhat is the difference between these images?\nassistant\nThe first image shows a stop sign with a traditional Chinese architectural background, while the second image displays a radar chart with various algorithms and models, including BLIP-2, InstructBLIP, Q", "user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different"] # fmt: skip
DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)

self.assertEqual(
self.processor.decode(output[0], skip_special_tokens=True),
EXPECTED_DECODED_TEXT,
)
self.assertListEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)

@slow
@require_bitsandbytes