huggingface · Elon7069 · Oct 18, 2025
diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py
@@ -143,8 +143,32 @@ def __call__(
             **kwargs,
         )
         if images is not None:
-            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
-            image_grid_thw = image_inputs["image_grid_thw"]
+            # Preserve per-sample image grouping when a nested list of images is provided
+            if isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)):
+                per_sample_inputs = [
+                    self.image_processor(images=imgs, **output_kwargs["images_kwargs"]) for imgs in images
+                ]
+                per_sample_pixel_values = [ps["pixel_values"] for ps in per_sample_inputs]
+                # Concatenate image_grid_thw across samples for compatibility with text token placeholder logic
+                image_grid_thw = []
+                for ps in per_sample_inputs:
+                    image_grid_thw.extend(ps.get("image_grid_thw", []))
+
+                # Zero-pad along image dimension to the max number of images in the batch, then stack batch-first
+                max_n = max(p.shape[0] for p in per_sample_pixel_values) if len(per_sample_pixel_values) > 0 else 0
+                padded = []
+                for p in per_sample_pixel_values:
+                    if p.shape[0] < max_n:
+                        pad_shape = (max_n - p.shape[0],) + p.shape[1:]
+                        pad = np.zeros(pad_shape, dtype=p.dtype)
+                        p = np.concatenate([p, pad], axis=0)
+                    padded.append(p)
+                # Final shape: [B, max_n, ...]
+                pixel_values = np.stack(padded, axis=0) if max_n > 0 else np.zeros((0,), dtype=np.float32)
+                image_inputs = {"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}
+            else:
+                image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+                image_grid_thw = image_inputs["image_grid_thw"]
         else:
             image_inputs = {}
             image_grid_thw = None

diff --git a/tests/models/qwen3_vl/test_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_processing_qwen3_vl.py
@@ -148,6 +148,25 @@ def test_model_input_names(self):
 
         self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names))
 
+    @require_vision
+    @require_torch
+    @require_torchvision
+    def test_multiple_images_per_sample_preserves_batch(self):
+        # Obtain the processor under test through the test class's get_processor() helper
+        processor = self.get_processor()
+        # Nested image list describing two samples: the first holds 2 images, the second holds 1
+        img1 = np.zeros((224, 224, 3), dtype=np.uint8)
+        img2 = np.zeros((224, 224, 3), dtype=np.uint8)
+        images = [[img1, img2], [img1]]
+        text = ["caption one", "caption two"]
+
+        inputs = processor(images=images, text=text, return_tensors="np", padding=True)
+        pixel_values = inputs["pixel_values"]
+
+        # With return_tensors="np", pixel_values must be an ndarray with one leading entry per sample
+        self.assertIsInstance(pixel_values, np.ndarray)
+        self.assertEqual(pixel_values.shape[0], len(images))
+
     @require_torch
     @require_av
     def _test_apply_chat_template(