Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 8 additions & 14 deletions src/transformers/models/colqwen2/modeling_colqwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from ... import initialization as init
from ...cache_utils import Cache
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available
from .configuration_colqwen2 import ColQwen2Config


Expand Down Expand Up @@ -133,12 +134,9 @@ def forward(
labels: torch.LongTensor | None = None,
inputs_embeds: torch.FloatTensor | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
pixel_values: torch.Tensor | None = None,
image_grid_thw: torch.LongTensor | None = None,
**kwargs,
**kwargs: Unpack[TransformersKwargs],
) -> ColQwen2ForRetrievalOutput:
r"""
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
Expand All @@ -152,12 +150,9 @@ def forward(
mask = arange.unsqueeze(0) < offsets.unsqueeze(1) # (batch_size, max_len)
pixel_values = pixel_values[mask] # (total_valid_patches, channels, height, width)

output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = kwargs.pop("output_hidden_states", None)
if output_hidden_states is None:
output_hidden_states = self.config.output_hidden_states

# Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
if inputs_embeds is None:
Expand All @@ -180,9 +175,8 @@ def forward(
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
output_hidden_states=True,
**kwargs,
)

vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
Expand Down
21 changes: 7 additions & 14 deletions src/transformers/models/colqwen2/modular_colqwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ...image_utils import ImageInput, is_valid_image
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
from ..colpali.modeling_colpali import ColPaliForRetrieval, ColPaliPreTrainedModel
from ..colpali.processing_colpali import ColPaliProcessor
from .configuration_colqwen2 import ColQwen2Config
Expand Down Expand Up @@ -274,12 +274,9 @@ def forward(
labels: torch.LongTensor | None = None,
inputs_embeds: torch.FloatTensor | None = None,
use_cache: bool | None = None,
output_attentions: bool | None = None,
output_hidden_states: bool | None = None,
return_dict: bool | None = None,
pixel_values: torch.Tensor | None = None,
image_grid_thw: torch.LongTensor | None = None,
**kwargs,
**kwargs: Unpack[TransformersKwargs],
) -> ColQwen2ForRetrievalOutput:
r"""
image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
Expand All @@ -293,12 +290,9 @@ def forward(
mask = arange.unsqueeze(0) < offsets.unsqueeze(1) # (batch_size, max_len)
pixel_values = pixel_values[mask] # (total_valid_patches, channels, height, width)

output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
output_hidden_states = kwargs.pop("output_hidden_states", None)
if output_hidden_states is None:
output_hidden_states = self.config.output_hidden_states

# Custom data preparation to fix an issue with the gradient flow when training with multiple GPUs.
if inputs_embeds is None:
Expand All @@ -321,9 +315,8 @@ def forward(
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
output_hidden_states=True,
**kwargs,
)

vlm_hidden_states = vlm_output.hidden_states if output_hidden_states else None
Expand Down
Loading