diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py
index df95aa5fbe53..4e3b0f08a441 100644
--- a/src/transformers/models/colqwen2/modeling_colqwen2.py
+++ b/src/transformers/models/colqwen2/modeling_colqwen2.py
@@ -27,7 +27,8 @@
 from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import PreTrainedModel
-from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available
 from .configuration_colqwen2 import ColQwen2Config
 
 
@@ -133,12 +134,9 @@ def forward(
         labels: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -152,12 +150,9 @@ def forward(
         mask = arange.unsqueeze(0) < offsets.unsqueeze(1)  # (batch_size, max_len)
         pixel_values = pixel_values[mask]  # (total_valid_patches, channels, height, width)
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = kwargs.pop("output_hidden_states", None)
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
 
         # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
         if inputs_embeds is None:
@@ -180,9 +175,8 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=True,
+            **kwargs,
         )
 
         vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py
index d28367a45857..69ac33d4ab06 100644
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@@ -19,7 +19,7 @@
 from ...image_utils import ImageInput, is_valid_image
 from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
 from ..colpali.modeling_colpali import ColPaliForRetrieval, ColPaliPreTrainedModel
 from ..colpali.processing_colpali import ColPaliProcessor
 from .configuration_colqwen2 import ColQwen2Config
@@ -274,12 +274,9 @@ def forward(
         labels: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -293,12 +290,9 @@ def forward(
         mask = arange.unsqueeze(0) < offsets.unsqueeze(1)  # (batch_size, max_len)
         pixel_values = pixel_values[mask]  # (total_valid_patches, channels, height, width)
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = kwargs.pop("output_hidden_states", None)
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
 
         # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
         if inputs_embeds is None:
@@ -321,9 +315,8 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=True,
+            **kwargs,
         )
 
         vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None