From f9b270a2f896d420f75e935a79a191bb85015116 Mon Sep 17 00:00:00 2001
From: Rohan Kulkarni - Personal
Date: Sat, 14 Mar 2026 13:10:20 -0700
Subject: [PATCH] [ColQwen2] Refactor output tracing (issue #43979)

Applies the output tracing refactor to ColQwen2ForRetrieval as part of the
broader effort tracked in issue #43979 to modernize output handling across
all models in the library.

Changes in both modular_colqwen2.py and modeling_colqwen2.py:

- Add TransformersKwargs to the imports; add the Unpack import in the
  modeling file
- Remove the explicit output_attentions, output_hidden_states, and
  return_dict parameters from ColQwen2ForRetrieval.forward(): these are
  now captured via **kwargs: Unpack[TransformersKwargs]
- Remove the manual config-resolution boilerplate for those three flags
- Pop output_hidden_states from kwargs and fall back to the config value
  (matching the pattern established in the ColPali parent; see the sketch
  below)
- Always pass output_hidden_states=True to self.vlm.model() so hidden
  states are always collected internally; they are only returned to the
  caller when explicitly requested
- Spread **kwargs into self.vlm.model() so output_attentions and other
  flags flow through naturally to the underlying Qwen2-VL model, which
  already handles output capturing via @capture_outputs and
  @merge_with_config_defaults

The @can_return_tuple decorator (already present) continues to handle the
return_dict=False tuple-conversion path.
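For reference, the pop-with-config-fallback pattern reduces to the
standalone sketch below; demo_forward and cfg are illustrative stand-ins,
not code from this patch:

    from types import SimpleNamespace

    def demo_forward(config, **kwargs):
        # An explicit caller flag wins; otherwise fall back to the config default.
        output_hidden_states = kwargs.pop("output_hidden_states", None)
        if output_hidden_states is None:
            output_hidden_states = config.output_hidden_states
        hidden_states = ("h0", "h1")  # stand-in: always collected internally
        return hidden_states if output_hidden_states else None

    cfg = SimpleNamespace(output_hidden_states=False)
    assert demo_forward(cfg) is None  # config default applies
    assert demo_forward(cfg, output_hidden_states=True) == ("h0", "h1")  # explicit opt-in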
Co-Authored-By: Claude Sonnet 4.6
---
 .../models/colqwen2/modeling_colqwen2.py | 22 ++++++++--------------
 .../models/colqwen2/modular_colqwen2.py  | 21 +++++++--------------
 2 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py
index df95aa5fbe53..4e3b0f08a441 100644
--- a/src/transformers/models/colqwen2/modeling_colqwen2.py
+++ b/src/transformers/models/colqwen2/modeling_colqwen2.py
@@ -27,7 +27,8 @@
 from ... import initialization as init
 from ...cache_utils import Cache
 from ...modeling_utils import PreTrainedModel
-from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available
 from .configuration_colqwen2 import ColQwen2Config
@@ -133,12 +134,9 @@ def forward(
         labels: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -152,12 +150,9 @@ def forward(
             mask = arange.unsqueeze(0) < offsets.unsqueeze(1)  # (batch_size, max_len)
             pixel_values = pixel_values[mask]  # (total_valid_patches, channels, height, width)
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = kwargs.pop("output_hidden_states", None)
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
 
         # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
         if inputs_embeds is None:
@@ -180,9 +175,8 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=True,
+            **kwargs,
         )
 
         vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
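Note on the **kwargs: Unpack[TransformersKwargs] annotation: it lets type
checkers validate flag names and value types without listing each flag as
an explicit parameter. A minimal self-contained illustration, using a
stand-in DemoKwargs TypedDict rather than the real TransformersKwargs:

    from typing import TypedDict
    from typing_extensions import Unpack  # typing.Unpack on Python 3.11+

    class DemoKwargs(TypedDict, total=False):
        # Stand-in for TransformersKwargs: optional output-control flags.
        output_attentions: bool
        output_hidden_states: bool

    def forward(x: int, **kwargs: Unpack[DemoKwargs]) -> bool:
        # Checkers flag unknown keys or mistyped values at the call site.
        return kwargs.get("output_attentions", False)

    assert forward(0, output_attentions=True) is True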
diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py
index d28367a45857..69ac33d4ab06 100644
--- a/src/transformers/models/colqwen2/modular_colqwen2.py
+++ b/src/transformers/models/colqwen2/modular_colqwen2.py
@@ -19,7 +19,7 @@
 from ...image_utils import ImageInput, is_valid_image
 from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
-from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
 from ..colpali.modeling_colpali import ColPaliForRetrieval, ColPaliPreTrainedModel
 from ..colpali.processing_colpali import ColPaliProcessor
 from .configuration_colqwen2 import ColQwen2Config
@@ -274,12 +274,9 @@ def forward(
         labels: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
-        output_attentions: bool | None = None,
-        output_hidden_states: bool | None = None,
-        return_dict: bool | None = None,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.LongTensor | None = None,
-        **kwargs,
+        **kwargs: Unpack[TransformersKwargs],
     ) -> ColQwen2ForRetrievalOutput:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
@@ -293,12 +290,9 @@ def forward(
             mask = arange.unsqueeze(0) < offsets.unsqueeze(1)  # (batch_size, max_len)
             pixel_values = pixel_values[mask]  # (total_valid_patches, channels, height, width)
 
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = kwargs.pop("output_hidden_states", None)
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
 
         # Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
         if inputs_embeds is None:
@@ -321,9 +315,8 @@ def forward(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            output_hidden_states=True,
+            **kwargs,
        )
 
         vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
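Note on the retained @can_return_tuple path: its return_dict=False
tuple-conversion behavior can be approximated by the simplified stand-in
below (the real decorator in transformers.utils is more involved; this is
only a sketch of the idea):

    def can_return_tuple_sketch(fn):
        # Convert a dict-like output to a tuple of its non-None values
        # when the caller passes return_dict=False.
        def wrapped(*args, return_dict=True, **kwargs):
            out = fn(*args, **kwargs)
            if return_dict:
                return out
            return tuple(v for v in out.values() if v is not None)
        return wrapped

    @can_return_tuple_sketch
    def forward(**kwargs):
        return {"embeddings": [0.1, 0.2], "hidden_states": None}

    assert forward()["embeddings"] == [0.1, 0.2]
    assert forward(return_dict=False) == ([0.1, 0.2],)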