From b36137b0583af704ea40c1d2df68af91c9d5d3bc Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 16:37:04 +0800 Subject: [PATCH 1/7] Update processing_qwen2_5_vl.py --- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 0e9e064ecdda..b0c44f2c4514 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -143,16 +143,19 @@ def __call__( **kwargs, ) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + image_inputs = videos_inputs = {} if images is not None: image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) image_grid_thw = image_inputs["image_grid_thw"] if videos is not None: + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] - fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) if isinstance(fps, (int, float)): second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw) elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): @@ -187,7 +190,6 @@ def __call__( index += 1 text[i] = text[i].replace("<|placeholder|>", self.video_token) - return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) From 5a551a0b21f8b0f50a461adc94ba553f267a09bf Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 16:43:14 +0800 Subject: [PATCH 2/7] Update processing_qwen2_5_vl.py --- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index b0c44f2c4514..46f209b29716 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -143,8 +143,6 @@ def __call__( **kwargs, ) - return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) - image_inputs = videos_inputs = {} if images is not None: image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) @@ -152,6 +150,7 @@ def __call__( if videos is not None: fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + output_kwargs["videos_kwargs"].pop("return_tensors", None) videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] @@ -190,6 +189,7 @@ def __call__( index += 1 text[i] = text[i].replace("<|placeholder|>", self.video_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) From d31e922e2fe9aae164781a398afb8d1ae79434ad Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 16:54:59 +0800 Subject: [PATCH 3/7] Update modular_qwen2_5_vl.py --- src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index b4307161bd78..0cf6f0e0cc9a 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -1011,10 +1011,13 @@ def __call__( image_grid_thw = image_inputs["image_grid_thw"] if videos is not None: + # pop unexpected keys here for passing kwargs validation + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + output_kwargs["videos_kwargs"].pop("return_tensors", None) + videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] - fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) if isinstance(fps, (int, float)): second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw) elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): From 223763bb842356faf98b0cc363789670e420c11a Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 16:55:28 +0800 Subject: [PATCH 4/7] Fix CI --- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 46f209b29716..0e9e064ecdda 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -149,12 +149,10 @@ def __call__( image_grid_thw = image_inputs["image_grid_thw"] if videos is not None: - fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) - output_kwargs["videos_kwargs"].pop("return_tensors", None) - videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) if isinstance(fps, (int, float)): second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw) elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): From 1073b3c1931ef337505570967d2727516d212d47 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 17:15:58 +0800 Subject: [PATCH 5/7] Update modular_qwen2_5_vl.py --- src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 0cf6f0e0cc9a..9ac44b729496 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -1011,9 +1011,8 @@ def __call__( image_grid_thw = image_inputs["image_grid_thw"] if videos is not None: - # pop unexpected keys here for passing kwargs validation + # pop fps in advance for passing kwargs validation fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) - output_kwargs["videos_kwargs"].pop("return_tensors", None) videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] From 22e30783e3389e99a23d2e8ef7202a0d7ce4ab76 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 17:16:22 +0800 Subject: [PATCH 6/7] Update processing_qwen2_5_vl.py --- src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 0e9e064ecdda..0c69e370651b 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -149,10 +149,12 @@ def __call__( image_grid_thw = image_inputs["image_grid_thw"] if videos is not None: + # pop fps in advance for passing kwargs validation + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"]) video_grid_thw = videos_inputs["video_grid_thw"] - fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) if isinstance(fps, (int, float)): second_per_grid_ts = [self.video_processor.temporal_patch_size / fps] * len(video_grid_thw) elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): From 9894fb5af853b1c8fe0ac88659f7ad0995fbe61d Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Mon, 26 May 2025 18:00:25 +0800 Subject: [PATCH 7/7] Update video_processing_utils.py --- src/transformers/video_processing_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/video_processing_utils.py b/src/transformers/video_processing_utils.py index c55e3944b4ea..122bc89784ae 100644 --- a/src/transformers/video_processing_utils.py +++ b/src/transformers/video_processing_utils.py @@ -252,7 +252,10 @@ def preprocess( videos: VideoInput, **kwargs: Unpack[VideosKwargs], ) -> BatchFeature: - validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys()) + validate_kwargs( + captured_kwargs=kwargs.keys(), + valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"], + ) # Set default kwargs from self. This ensures that if a kwarg is not provided # by the user, it gets its default value from the instance, or is set to None. for kwarg_name in self.valid_kwargs.__annotations__: