diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py index d8d0cc11ffa5..19fe40874868 100644 --- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py @@ -143,8 +143,32 @@ def __call__( **kwargs, ) if images is not None: - image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - image_grid_thw = image_inputs["image_grid_thw"] + # Preserve per-sample image grouping when a nested list of images is provided + if isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)): + per_sample_inputs = [ + self.image_processor(images=imgs, **output_kwargs["images_kwargs"]) for imgs in images + ] + per_sample_pixel_values = [ps["pixel_values"] for ps in per_sample_inputs] + # Concatenate image_grid_thw across samples for compatibility with text token placeholder logic + image_grid_thw = [] + for ps in per_sample_inputs: + image_grid_thw.extend(ps.get("image_grid_thw", [])) + + # Zero-pad along image dimension to the max number of images in the batch, then stack batch-first + max_n = max(p.shape[0] for p in per_sample_pixel_values) if len(per_sample_pixel_values) > 0 else 0 + padded = [] + for p in per_sample_pixel_values: + if p.shape[0] < max_n: + pad_shape = (max_n - p.shape[0],) + p.shape[1:] + pad = np.zeros(pad_shape, dtype=p.dtype) + p = np.concatenate([p, pad], axis=0) + padded.append(p) + # Final shape: [B, max_n, ...] + pixel_values = np.stack(padded, axis=0) if max_n > 0 else np.zeros((0,), dtype=np.float32) + image_inputs = {"pixel_values": pixel_values, "image_grid_thw": image_grid_thw} + else: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + image_grid_thw = image_inputs["image_grid_thw"] else: image_inputs = {} image_grid_thw = None diff --git a/tests/models/qwen3_vl/test_processing_qwen3_vl.py b/tests/models/qwen3_vl/test_processing_qwen3_vl.py index 9ce056a207ac..3579e2b6ee47 100644 --- a/tests/models/qwen3_vl/test_processing_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_processing_qwen3_vl.py @@ -148,6 +148,25 @@ def test_model_input_names(self): self.assertSetEqual(set(inputs.keys()), set(processor.model_input_names)) + @require_vision + @require_torch + @require_torchvision + def test_multiple_images_per_sample_preserves_batch(self): + # Build a processor from the small tmp pretrained saved in setUpClass + processor = self.get_processor() + # Create two samples: first has 2 images, second has 1 image + img1 = np.zeros((224, 224, 3), dtype=np.uint8) + img2 = np.zeros((224, 224, 3), dtype=np.uint8) + images = [[img1, img2], [img1]] + text = ["caption one", "caption two"] + + inputs = processor(images=images, text=text, return_tensors="np", padding=True) + pixel_values = inputs["pixel_values"] + + # Should preserve batch dimension (batch-first) and return an ndarray when tensors='np' + self.assertIsInstance(pixel_values, np.ndarray) + self.assertEqual(pixel_values.shape[0], len(images)) + @require_torch @require_av def _test_apply_chat_template(