diff --git a/.circleci/config.yml b/.circleci/config.yml index ab63a3823c2f..7c0da230b727 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -190,6 +190,7 @@ jobs: - run: python utils/check_config_docstrings.py - run: python utils/check_config_attributes.py - run: python utils/check_doctest_list.py + - run: python utils/check_decorator_return_types.py - run: python utils/update_metadata.py --check-only - run: python utils/add_dates.py --check-only - run: > diff --git a/.github/workflows/pr-repo-consistency-bot.yml b/.github/workflows/pr-repo-consistency-bot.yml index 2d793d822713..3ee58cebf560 100644 --- a/.github/workflows/pr-repo-consistency-bot.yml +++ b/.github/workflows/pr-repo-consistency-bot.yml @@ -170,6 +170,7 @@ jobs: cp utils/check_pipeline_typing.py pr-repo/utils/check_pipeline_typing.py cp utils/check_doctest_list.py pr-repo/utils/check_doctest_list.py cp utils/check_docstrings.py pr-repo/utils/check_docstrings.py + cp utils/check_decorator_return_types.py pr-repo/utils/check_decorator_return_types.py cp utils/add_dates.py pr-repo/utils/add_dates.py - name: Run repo consistency checks with trusted script @@ -197,6 +198,7 @@ jobs: python utils/check_pipeline_typing.py --fix_and_overwrite python utils/check_doctest_list.py --fix_and_overwrite python utils/check_docstrings.py --fix_and_overwrite + python utils/check_decorator_return_types.py --fix_and_overwrite python utils/add_dates.py # Check if there are changes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index be570fc0a1f1..9beefc7bb210 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -193,7 +193,7 @@ The library has 400+ models with many established patterns: - Search for similar models (e.g., other vision-language models) - Reuse attention mechanisms, layer implementations, and processing patterns - Check models like LLaVA, Idefics2, Fuyu for vision-language patterns -- Use provided decorators like (`auto_docstring`, `can_return_tuple`, `check_model_inputs` and `_can_record_outputs`) where relevant. +- Use provided decorators like (`auto_docstring`, `can_return_tuple`, `capture_outputs` and `_can_record_outputs`) where relevant. - Don't reinvent the wheel ☐ **7. 
Run quality checks and read the output** diff --git a/Makefile b/Makefile index ba78e2a4d461..629c30c315c8 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,7 @@ check-repo: -python utils/check_config_docstrings.py -python utils/check_config_attributes.py -python utils/check_doctest_list.py + -python utils/check_decorator_return_types.py -python utils/update_metadata.py --check-only -python utils/add_dates.py --check-only -@{ \ @@ -62,6 +63,7 @@ fix-repo: style -python utils/check_pipeline_typing.py --fix_and_overwrite -python utils/check_doctest_list.py --fix_and_overwrite -python utils/check_docstrings.py --fix_and_overwrite + -python utils/check_decorator_return_types.py --fix_and_overwrite -python utils/add_dates.py diff --git a/src/transformers/models/afmoe/modeling_afmoe.py b/src/transformers/models/afmoe/modeling_afmoe.py index 925b548af232..187f532e0207 100644 --- a/src/transformers/models/afmoe/modeling_afmoe.py +++ b/src/transformers/models/afmoe/modeling_afmoe.py @@ -572,7 +572,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/afmoe/modular_afmoe.py b/src/transformers/models/afmoe/modular_afmoe.py index c5e7eb1faef1..08a2864fb05c 100644 --- a/src/transformers/models/afmoe/modular_afmoe.py +++ b/src/transformers/models/afmoe/modular_afmoe.py @@ -394,7 +394,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 5b6630f2e54d..392c43cb9f0d 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -616,7 +616,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -651,7 +651,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 890f18316b6b..eb92fc87152a 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -392,7 +392,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutputWithPooling | tuple: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -466,7 +466,7 @@ def forward( labels: torch.LongTensor | None = None, sentence_order_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> 
AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -595,7 +595,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> MaskedLMOutput | tuple: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -687,7 +687,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> SequenceClassifierOutput | tuple: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -769,7 +769,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> TokenClassifierOutput | tuple: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -826,7 +826,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, @@ -893,7 +893,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
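The albert hunks above show the pattern this whole PR applies: a `forward` wrapped by `@can_return_tuple` (directly, or through the `@auto_docstring`/`@check_model_inputs` stack) always constructs and returns a `ModelOutput` dataclass, and the conversion to a plain tuple for `return_dict=False` happens inside the decorator. The `tuple |` half of the old annotations therefore described the wrapper, not the function body. Below is a minimal sketch of that mechanism, assuming a simplified control flow; the real decorator in `src/transformers/utils/generic.py` also consults `config.use_return_dict` and handles more edge cases.

```python
import functools

def can_return_tuple(func):
    """Simplified sketch: the decorated forward() body always returns a
    ModelOutput, and the tuple conversion happens out here in the wrapper."""

    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            # Assumption for this sketch: fall back to the model config.
            return_dict = getattr(self.config, "use_return_dict", True)
        output = func(self, *args, **kwargs)  # always a ModelOutput subclass
        # Because this branch lives outside the body, the body's return
        # annotation no longer needs `tuple | ...`.
        return output if return_dict else output.to_tuple()

    return wrapper
```

The paired `modular_*.py` / `modeling_*.py` edits follow from the modular system: the modeling files are generated from their modular counterparts, so both must change together.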
diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index e8c1e7ba4ae9..962ad6de9989 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -771,7 +771,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -897,7 +897,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1010,7 +1010,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Examples: @@ -1104,7 +1104,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1137,7 +1137,7 @@ def get_text_features( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1173,7 +1173,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AlignOutput: + ) -> AlignOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
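The new `utils/check_decorator_return_types.py` itself is not part of this excerpt; it is only wired into `.circleci/config.yml`, the PR consistency bot, and the `check-repo`/`fix-repo` Makefile targets above. The following is therefore a hypothetical sketch of the kind of check it performs — flag (and, with `--fix_and_overwrite`, rewrite) return annotations that still union the output class with `tuple`. Every name and regex here is an assumption for illustration, not the real utility.

```python
# Hypothetical sketch only -- the real utils/check_decorator_return_types.py is
# not shown in this diff, and presumably also verifies that a method actually
# carries a tuple-converting decorator before touching its annotation.
import re
import sys
from pathlib import Path

ANNOTATION_RE = re.compile(
    r"\)\s*->\s*(?:tuple(?:\[[^\]]*\])?\s*\|\s*(\w+)|(\w+)\s*\|\s*tuple(?:\[[^\]]*\])?)\s*:"
)

def process_file(path: Path, fix: bool) -> bool:
    """Return True if the file is clean; optionally rewrite it in place."""
    text = path.read_text()
    # Replace `tuple | X` / `X | tuple` annotations with just `X`.
    new_text = ANNOTATION_RE.sub(lambda m: f") -> {m.group(1) or m.group(2)}:", text)
    if new_text == text:
        return True
    if fix:
        path.write_text(new_text)
    return False

if __name__ == "__main__":
    fix = "--fix_and_overwrite" in sys.argv
    dirty = [
        str(p)
        for p in Path("src/transformers/models").rglob("*.py")
        if not process_file(p, fix)
    ]
    if dirty and not fix:
        raise ValueError("`tuple | ...` return annotations found in: " + ", ".join(dirty))
```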
diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 769edaec72ae..c8b1c1532bbc 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -384,7 +384,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -607,7 +607,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -825,7 +825,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, interpolate_pos_encoding: bool | None = False, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -949,6 +949,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple @auto_docstring # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( @@ -960,14 +961,12 @@ def forward( inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1054,7 +1053,7 @@ def forward( return_dict: bool | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndProjection: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Examples: @@ -1149,7 +1148,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1184,7 +1183,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 28685ed76d10..72006d37fe55 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -928,7 +928,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | 
None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -985,7 +985,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1101,7 +1101,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 07bf00d04569..608e639fb5c6 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1266,7 +1266,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -1297,7 +1297,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1376,7 +1376,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index f635206721db..eb8b94da0adf 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -329,7 +329,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -457,7 +457,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index 4efa10c7f702..ebb0bfdde457 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -70,7 +70,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -158,7 +158,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index c214332f00b5..2f655e753334 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -191,7 +191,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -257,7 +257,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -357,7 +357,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/aya_vision/modular_aya_vision.py b/src/transformers/models/aya_vision/modular_aya_vision.py index c09460d3a473..da441333021f 100644 --- a/src/transformers/models/aya_vision/modular_aya_vision.py +++ b/src/transformers/models/aya_vision/modular_aya_vision.py @@ -116,7 +116,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. 
image_outputs = self.vision_tower( @@ -158,7 +158,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -213,7 +213,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 837a083f283b..491310b2dbeb 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -650,7 +650,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -789,7 +789,7 @@ def forward( labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BertForPreTrainingOutput: + ) -> BertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -894,7 +894,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -980,7 +980,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1041,7 +1041,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -1129,7 +1129,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., @@ -1208,7 +1208,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1307,7 +1307,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1364,7 +1364,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.bert( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 5ddff5871a67..7f53310b7e6d 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -519,7 +519,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -674,7 +674,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 2325cbb7bcbe..10baae75afeb 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -488,7 +488,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -572,7 +572,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -604,7 +604,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -699,7 +699,7 @@ def forward( return_loss: bool | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipOutput: + ) -> BlipOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. @@ -817,7 +817,7 @@ def forward( interpolate_pos_encoding: bool = False, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipForConditionalGenerationModelOutput: + ) -> BlipForConditionalGenerationModelOutput: r""" Examples: @@ -992,7 +992,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" Examples: @@ -1224,7 +1224,7 @@ def forward( attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" use_itm_head (`bool`, *optional*, defaults to `True`): Whether or not to use the image-text matching head. diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 16dad19c4b23..80b019182910 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -508,7 +508,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -808,7 +808,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -965,7 +965,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. 
If cross-attention, @@ -1093,7 +1093,7 @@ def get_text_features( decoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1152,7 +1152,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: ```python @@ -1253,7 +1253,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2ForConditionalGenerationModelOutput: + ) -> Blip2ForConditionalGenerationModelOutput: r""" decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also @@ -1400,7 +1400,7 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2TextModelOutput: + ) -> Blip2TextModelOutput: r""" Examples: @@ -1482,7 +1482,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2VisionModelOutput: + ) -> Blip2VisionModelOutput: r""" Examples: @@ -1626,7 +1626,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1700,7 +1700,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2ForConditionalGenerationModelOutput: + ) -> Blip2ForConditionalGenerationModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index 0a1f8948c01b..983f3675c84d 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -1227,7 +1227,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1411,7 +1411,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. 
This tensor contains the processed image features that diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index 92a45ae01ab4..539c99ef5670 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -947,7 +947,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1131,7 +1131,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. This tensor contains the processed image features that diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 468175a9ea22..11b0e81483d8 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -1028,7 +1028,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index 9b840fdcd5ab..e2cce8060118 100755 --- a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -423,7 +423,7 @@ def forward( output_attentions: bool | None = False, output_hidden_states: bool | None = False, return_dict: bool | None = True, - ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions: + ) -> BaseModelOutputWithCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -570,7 +570,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values @@ -709,7 +709,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): Bounding box coordinates for each token in the input sequence. 
Each bounding box is a list of four values @@ -830,7 +830,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BrosSpadeOutput: + ) -> BrosSpadeOutput: r""" bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values @@ -967,7 +967,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 00973e39d04d..095ac4479aa3 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -629,7 +629,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -769,7 +769,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -865,7 +865,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -949,7 +949,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1050,7 +1050,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`: @@ -1118,7 +1118,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1219,7 +1219,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/camembert/modular_camembert.py b/src/transformers/models/camembert/modular_camembert.py index a7d98b334983..3e903ef9652d 100644 --- a/src/transformers/models/camembert/modular_camembert.py +++ b/src/transformers/models/camembert/modular_camembert.py @@ -74,7 +74,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -137,7 +137,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -216,7 +216,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -308,7 +308,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -371,7 +371,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`: @@ -450,7 +450,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index fac0ef50a382..7b1fbfcae3b5 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -894,7 +894,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor): ) def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1082,7 +1082,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 09d5e8822b61..8c76be58c32c 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -631,7 +631,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -682,7 +682,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -749,7 +749,7 @@ def forward( output_hidden_states: bool | None = None, interpolate_pos_encoding: bool = False, return_dict: bool | None = None, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -842,7 +842,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1011,7 +1011,7 @@ def get_text_features( token_type_ids: torch.Tensor | None = None, position_ids: 
torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1047,7 +1047,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1094,7 +1094,7 @@ def forward( interpolate_pos_encoding: bool = False, return_dict: bool | None = None, **kwargs, - ) -> tuple | ChineseCLIPOutput: + ) -> ChineseCLIPOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index ba0b303b06c1..f43091b83950 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1259,7 +1259,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -1455,14 +1455,12 @@ def forward( inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1560,7 +1558,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1595,7 +1593,7 @@ def get_audio_features( is_longer: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance @@ -1637,7 +1635,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | ClapOutput: + ) -> ClapOutput: r""" is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): Whether the audio clip is longer than `max_length`. 
If `True`, a feature fusion will be enabled to enhance @@ -1751,7 +1749,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | ClapTextModelOutput: + ) -> ClapTextModelOutput: r""" Examples: @@ -1815,7 +1813,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | ClapAudioModelOutput: + ) -> ClapAudioModelOutput: r""" is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*): Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index e7540a8962ac..87fa9d35ce4e 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -805,7 +805,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -840,7 +840,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index e3e2dfdd611b..b29d033ea59e 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -489,7 +489,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -848,7 +848,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -882,7 +882,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 3f7b4ee0cc38..862be10c8d4d 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1516,7 +1516,7 @@ def get_text_features( text_encoder_inputs_embeds: torch.FloatTensor | None = None, attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | ClvpEncoderOutput: + ) -> ClvpEncoderOutput: r""" text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*): inputs_embeds for the text encoder model passed in place of `input_ids`. 
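Note that the altclip and clap hunks above also drop `return_dict` from the signatures while keeping `**kwargs`, so callers keep the old escape hatch: the argument is consumed by the decorator rather than by the forward body. A short usage sketch of the unchanged caller-facing contract follows; the checkpoint name is just an example, and this assumes `return_dict` is still routed through `**kwargs` to the decorator, as the hunks suggest.

```python
import torch
from transformers import AutoModel

# from_pretrained returns the model in eval mode, so both calls are deterministic.
model = AutoModel.from_pretrained("bert-base-uncased")
input_ids = torch.tensor([[101, 7592, 102]])  # [CLS] hello [SEP]

as_output = model(input_ids)                    # BaseModelOutputWithPoolingAndCrossAttentions
as_tuple = model(input_ids, return_dict=False)  # plain tuple, produced by the decorator
assert torch.equal(as_tuple[0], as_output.last_hidden_state)
```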
diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py index 6ba33d437e1b..653b83182883 100644 --- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -170,7 +170,7 @@ def set_input_embeddings(self, value): ) def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) selected_image_feature = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature) @@ -214,7 +214,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Cohere2VisionModelOutputWithPast: + ) -> Cohere2VisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -294,7 +294,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Cohere2VisionCausalLMOutputWithPast: + ) -> Cohere2VisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 1681a0b0e7d0..0b393d17b0ee 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -101,7 +101,7 @@ class Cohere2VisionModel(AyaVisionModel): ) def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) selected_image_feature = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature) @@ -119,7 +119,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Cohere2VisionModelOutputWithPast: + ) -> Cohere2VisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -176,7 +176,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Cohere2VisionCausalLMOutputWithPast: + ) -> Cohere2VisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2de83de19c12..6bf377619d40 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1674,7 +1674,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | ConditionalDetrSegmentationOutput: + ) -> ConditionalDetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index 08af7f1f50d6..832f8076481f 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -454,7 +454,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*): The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model) @@ -587,7 +587,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*): The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model) @@ -964,7 +964,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CsmOutputWithPast: + ) -> CsmOutputWithPast: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`): 1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index 8a21fdfee90f..d314d32b561a 100644 --- a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -174,7 +174,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*): The last hidden state of the backbone model. 
Such input is required when the first codebook token (the one generated by the backbone model) @@ -327,7 +327,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*): The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model) @@ -614,7 +614,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CsmOutputWithPast: + ) -> CsmOutputWithPast: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`): 1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index 1c758f8b1dcd..405d24cd8510 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -1627,7 +1627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineModelOutput: + ) -> DFineModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1948,7 +1948,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineObjectDetectionOutput: + ) -> DFineObjectDetectionOutput: r""" Example: diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index c86d726439d6..0851a2e3c672 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -592,7 +592,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -781,7 +781,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -882,7 +882,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -947,7 +947,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1022,7 +1022,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1122,7 +1122,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1181,7 +1181,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/data2vec/modular_data2vec_text.py b/src/transformers/models/data2vec/modular_data2vec_text.py index ac77a81841d9..eca25868b4a6 100644 --- a/src/transformers/models/data2vec/modular_data2vec_text.py +++ b/src/transformers/models/data2vec/modular_data2vec_text.py @@ -145,7 +145,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -246,7 +246,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -311,7 +311,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -386,7 +386,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
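Every hunk in this stretch makes the same change: `forward` and the feature-extraction helpers now advertise only the `ModelOutput` subclass, dropping the `tuple |` half of the union. The tuple form is still reachable at runtime, but the dict-to-tuple downgrade happens inside a decorator rather than in the method body, so the undecorated function can be annotated with just the structured output. A minimal sketch of how a `can_return_tuple`-style wrapper could perform that conversion (the names and the `use_return_dict` lookup here are illustrative, not the library's actual code):

```python
import functools


def can_return_tuple_sketch(forward):
    """Toy stand-in for a @can_return_tuple-style decorator (details differ)."""

    @functools.wraps(forward)
    def wrapper(self, *args, return_dict=None, **kwargs):
        output = forward(self, *args, **kwargs)
        use_dict = return_dict if return_dict is not None else getattr(self.config, "use_return_dict", True)
        # ModelOutput subclasses expose to_tuple(); only downgrade on request.
        if not use_dict and hasattr(output, "to_tuple"):
            return output.to_tuple()
        return output

    return wrapper
```

Under this scheme the `tuple` alternative belongs to the decorated call site, not to the function definition the annotation describes.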
@@ -486,7 +486,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -545,7 +545,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.data2vec_text( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py index 8ac371dc2f81..cdcee5bd338a 100644 --- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -157,7 +157,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state) diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py index 7f02b2d56b40..ffdcefa24dea 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py @@ -299,7 +299,7 @@ def get_image_features( pixel_values: torch.FloatTensor, high_res_pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithHighResVisionEncodings: + ) -> BaseModelOutputWithHighResVisionEncodings: low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs) high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs) image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state) diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 42d97546b2e4..c49200f84c5e 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -318,7 +318,7 @@ def get_image_features( pixel_values: torch.FloatTensor, high_res_pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithHighResVisionEncodings: + ) -> BaseModelOutputWithHighResVisionEncodings: low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs) high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs) image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state) diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 3ee685a887c1..41fa7b40d950 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ 
b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1035,7 +1035,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1312,7 +1312,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1571,7 +1571,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index dfbc0783fb0a..213c5571edfd 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -696,7 +696,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -817,7 +817,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1094,7 +1094,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1338,7 +1338,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. 
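Note that the deformable-DETR hunks also annotate `forward` methods that previously had no return type at all (`- ):` becoming `+ ) -> DeformableDetrDecoderOutput:`). The classes being named are `ModelOutput`-style dataclasses; here is a stripped-down illustration of that pattern, with `TinyDecoderOutput` as a made-up stand-in (the real base class in transformers is richer, with dict- and index-style access):

```python
from dataclasses import dataclass, fields


@dataclass
class TinyDecoderOutput:
    last_hidden_state: object = None
    hidden_states: tuple | None = None
    attentions: tuple | None = None

    def to_tuple(self) -> tuple:
        # Populated fields only, in declaration order.
        return tuple(getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None)


out = TinyDecoderOutput(last_hidden_state="h")
assert out.to_tuple() == ("h",)
```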
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 4906b3510f44..e29a17e3794b 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1148,7 +1148,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrModelOutput: + ) -> DetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1327,7 +1327,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrObjectDetectionOutput: + ) -> DetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1488,7 +1488,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrSegmentationOutput: + ) -> DetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py index f7ba0a3d4b81..e3c96e496e99 100644 --- a/src/transformers/models/dia/modeling_dia.py +++ b/src/transformers/models/dia/modeling_dia.py @@ -466,7 +466,7 @@ def forward( output_attentions: bool | None = False, output_hidden_states: bool | None = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> BaseModelOutput | tuple: + ) -> BaseModelOutput: hidden_states = self.embedding(input_ids) # RoPE @@ -599,7 +599,7 @@ def forward( output_hidden_states: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> BaseModelOutputWithPastAndCrossAttentions | tuple: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. 
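One more reason the removals in the DETR-family hunks above are a net win: in the old unions, `tuple[torch.FloatTensor]` literally denotes a 1-tuple holding a single tensor (a variable-length tuple is spelled `tuple[torch.FloatTensor, ...]`), so the discarded annotations were not just redundant but wrong about the tuple's shape. For the record:

```python
# Fixed- vs variable-length tuple annotations (PEP 585 syntax):
pair_of_floats: tuple[float, float]   # exactly two elements
single_float: tuple[float]            # exactly one element -- what the old
                                      # `tuple[torch.FloatTensor]` claimed
any_floats: tuple[float, ...]         # any number of elements
```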
@@ -711,7 +711,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -848,7 +848,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index cbf74517e61c..b5e36236f95d 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -257,7 +257,7 @@ def forward( output_attentions: bool | None = False, output_hidden_states: bool | None = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> BaseModelOutput | tuple: + ) -> BaseModelOutput: hidden_states = self.embedding(input_ids) # RoPE @@ -390,7 +390,7 @@ def forward( output_hidden_states: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> BaseModelOutputWithPastAndCrossAttentions | tuple: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. @@ -502,7 +502,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -639,7 +639,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 4696669df624..690e28d2db4b 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -393,7 +393,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutput | tuple[torch.Tensor, ...]: + ) -> BaseModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): Indices of input sequence tokens in the vocabulary. @@ -484,7 +484,7 @@ def forward( labels: torch.LongTensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> MaskedLMOutput | tuple[torch.Tensor, ...]: + ) -> MaskedLMOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): Indices of input sequence tokens in the vocabulary. 
@@ -578,7 +578,7 @@ def forward( labels: torch.LongTensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> SequenceClassifierOutput | tuple[torch.Tensor, ...]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -677,7 +677,7 @@ def forward( end_positions: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> QuestionAnsweringModelOutput | tuple[torch.Tensor, ...]: + ) -> QuestionAnsweringModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): Indices of input sequence tokens in the vocabulary. @@ -776,7 +776,7 @@ def forward( labels: torch.LongTensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> TokenClassifierOutput | tuple[torch.Tensor, ...]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -851,7 +851,7 @@ def forward( labels: torch.LongTensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> MultipleChoiceModelOutput | tuple[torch.Tensor, ...]: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py index 099f9ca789c0..7f9a08323876 100644 --- a/src/transformers/models/edgetam/modeling_edgetam.py +++ b/src/transformers/models/edgetam/modeling_edgetam.py @@ -451,7 +451,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | EdgeTamVisionEncoderOutput: + ) -> EdgeTamVisionEncoderOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1210,7 +1210,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | EdgeTamVisionEncoderOutput: + ) -> EdgeTamVisionEncoderOutput: r""" pixel_values (`torch.FloatTensor`): Input pixel values of shape `(batch_size, num_channels, height, width)`. 
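As elsewhere in this diff, the EdgeTAM change lands twice: once in the generated `modeling_edgetam.py` above and once in the hand-written `modular_edgetam.py` below, since the former is produced from the latter and the two must stay in sync. A toy `ast`-based check that paired files agree on `forward` return annotations (a hypothetical helper for illustration, not the repo's actual tooling):

```python
import ast


def forward_return_annotations(source: str) -> dict[str, str]:
    """Map 'ClassName.forward' to its return annotation ('' if absent)."""
    found = {}
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.ClassDef):
            for item in node.body:
                if isinstance(item, ast.FunctionDef) and item.name == "forward":
                    found[f"{node.name}.forward"] = ast.unparse(item.returns) if item.returns else ""
    return found


modular = "class VisionModel:\n    def forward(self) -> 'VisionEncoderOutput': ...\n"
modeling = modular  # the generated copy is expected to match
assert forward_return_annotations(modular) == forward_return_annotations(modeling)
```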
diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py index 648c78f96f2f..75ac7b05a3e6 100644 --- a/src/transformers/models/edgetam/modular_edgetam.py +++ b/src/transformers/models/edgetam/modular_edgetam.py @@ -255,7 +255,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | EdgeTamVisionEncoderOutput: + ) -> EdgeTamVisionEncoderOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") diff --git a/src/transformers/models/edgetam_video/modeling_edgetam_video.py b/src/transformers/models/edgetam_video/modeling_edgetam_video.py index 89a72e6c88b5..3ca98625b04b 100644 --- a/src/transformers/models/edgetam_video/modeling_edgetam_video.py +++ b/src/transformers/models/edgetam_video/modeling_edgetam_video.py @@ -2235,7 +2235,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | EdgeTamVideoVisionEncoderOutput: + ) -> EdgeTamVideoVisionEncoderOutput: r""" pixel_values (`torch.FloatTensor`): Input pixel values of shape `(batch_size, num_channels, height, width)`. diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 37864955468d..8018b1be9347 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -595,7 +595,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions: + ) -> BaseModelOutputWithCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -845,7 +845,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -923,7 +923,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | ElectraForPreTrainingOutput: + ) -> ElectraForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring) @@ -1025,7 +1025,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1092,7 +1092,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
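A side note on the `**kwargs: Unpack[TransformersKwargs]` idiom that threads through nearly every signature here: it types the accepted keyword arguments against a `TypedDict` (PEP 692), so a type checker can validate call sites key by key. A self-contained miniature, where `AttnKwargs` is invented for illustration (`TransformersKwargs` is the library's real TypedDict):

```python
from typing_extensions import TypedDict, Unpack


class AttnKwargs(TypedDict, total=False):
    output_attentions: bool
    output_hidden_states: bool


def encode(**kwargs: Unpack[AttnKwargs]) -> None:
    print(kwargs.get("output_attentions", False))


encode(output_attentions=True)   # fine for a type checker
# encode(output_attention=True)  # typo: a checker would reject this key
```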
@@ -1151,7 +1151,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: discriminator_hidden_states = self.electra( input_ids, attention_mask=attention_mask, @@ -1218,7 +1218,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1333,7 +1333,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 46d662025f5e..d6a35fc67978 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1406,7 +1406,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L ) def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Emu3VQVAEModelOutput: + ) -> Emu3VQVAEModelOutput: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. @@ -1485,7 +1485,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`): The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using @@ -1566,7 +1566,7 @@ def forward( labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`): The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index e3c64ab770c1..5ed0f8a09fb4 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -960,7 +960,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L ) def get_image_features( self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Emu3VQVAEModelOutput: + ) -> Emu3VQVAEModelOutput: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. 
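Many of these signatures carry `logits_to_keep: int | torch.Tensor = 0`. By the usual transformers convention (individual models can vary), an int `n` keeps logits only for the last `n` positions, `0` keeps everything, and a tensor is used directly as an index; a minimal sketch of that slicing:

```python
import torch


def select_logit_positions(hidden_states: torch.Tensor, logits_to_keep) -> torch.Tensor:
    # slice(-0, None) == slice(0, None), so the int 0 naturally keeps all positions
    index = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    return hidden_states[:, index, :]


h = torch.randn(2, 10, 8)
assert select_logit_positions(h, 0).shape == (2, 10, 8)  # keep every position
assert select_logit_positions(h, 1).shape == (2, 1, 8)   # final token only
```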
@@ -1039,7 +1039,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`): The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using @@ -1120,7 +1120,7 @@ def forward( labels: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`): The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 2df8cf56eb05..3f6f5895ff08 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -329,7 +329,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 014339899aff..db413e20b8f6 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -613,7 +613,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -809,7 +809,7 @@ def forward( labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput: + ) -> ErnieForPreTrainingOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -931,7 +931,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -1024,7 +1024,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -1102,7 +1102,7 @@ 
def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -1197,7 +1197,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -1283,7 +1283,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1389,7 +1389,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -1453,7 +1453,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as diff --git a/src/transformers/models/ernie/modular_ernie.py b/src/transformers/models/ernie/modular_ernie.py index 5602008a3837..86a2f080c265 100644 --- a/src/transformers/models/ernie/modular_ernie.py +++ b/src/transformers/models/ernie/modular_ernie.py @@ -207,7 +207,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -349,7 +349,7 @@ def forward( labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput: + ) -> ErnieForPreTrainingOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -432,7 +432,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> 
CausalLMOutputWithCrossAttentions: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -502,7 +502,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -556,7 +556,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -630,7 +630,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -702,7 +702,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -793,7 +793,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as @@ -846,7 +846,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Task type embedding is a special embedding to represent the characteristic of different tasks, such as diff --git a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py index 967d43107793..a91757ad381b 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py @@ -931,7 +931,7 @@ def rot_pos_emb(self, grid_thw): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" grid_thw (`torch.LongTensor` of shape `(num_images, 3)`): The temporal, height and width dimensions of feature shape for each image. 
Each row contains [t, h, w] values. @@ -1274,7 +1274,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1299,7 +1299,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1416,7 +1416,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*): Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2). @@ -1640,7 +1640,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*): Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2). diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index 9ff6f4f50b1c..869b611608d7 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -892,7 +892,7 @@ def get_device(self): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: hidden_states = self.patch_embed(hidden_states) rotary_pos_emb = self.rot_pos_emb(grid_thw) emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) @@ -1162,7 +1162,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: video_outputs = self.vision_tower(pixel_values_videos, video_grid_thw, return_dict=True, **kwargs) video_embeds = self.resampler_model(video_outputs.last_hidden_state, video_grid_thw) split_sizes = ( @@ -1181,7 +1181,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, image_grid_thw, return_dict=True, **kwargs) image_embeds = self.resampler_model(image_outputs.last_hidden_state, image_grid_thw) split_sizes = (image_grid_thw.prod(-1) // self.vision_tower.spatial_merge_size**2).tolist() @@ -1208,7 +1208,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None 
= None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*): Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2). @@ -1349,7 +1349,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*): Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2). diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index f454c3a7d8b4..649086899eb8 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -498,7 +498,7 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithCrossAttentions: for i, layer_module in enumerate(self.layer): hidden_states = layer_module( hidden_states, @@ -621,7 +621,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`): Indices of input sequence tokens in the vocabulary. @@ -760,7 +760,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -846,7 +846,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -919,7 +919,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
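The Ernie 4.5 VL hunks above show `@capture_outputs` stacked on the vision tower's `forward`, which is part of why those methods can return the bare `BaseModelOutputWithPooling`. What follows is only a guess at the decorator's general shape, namely collecting requested intermediates and attaching them to the returned output; the real implementation in transformers is likely hook-based and more involved:

```python
import functools


def capture_outputs_sketch(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, output_hidden_states=False, output_attentions=False, **kwargs):
        collected = {"hidden_states": [], "attentions": []}
        # A real implementation would register forward hooks on submodules here
        # so they append into `collected` while the model runs.
        output = forward(self, *args, **kwargs)
        if output_hidden_states:
            output.hidden_states = tuple(collected["hidden_states"])
        if output_attentions:
            output.attentions = tuple(collected["attentions"])
        return output

    return wrapper
```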
diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index a3d3ea5ffbb5..940c91540194 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -470,7 +470,7 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithCrossAttentions: for i, layer_module in enumerate(self.layer): hidden_states = layer_module( hidden_states, @@ -545,7 +545,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -701,7 +701,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -1321,7 +1323,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -1436,7 +1438,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index 6a6c67639eaf..56e2fa4ae6ac 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -230,7 +230,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -386,7 +386,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -785,7 +787,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -900,7 +902,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
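The EXAONE hunks that follow retain the `(input_ids is None) ^ (inputs_embeds is not None)` guard seen throughout this diff; the XOR is truthy, and the `ValueError` fires, unless exactly one of the two inputs was supplied. A quick truth-table check:

```python
def must_raise(input_ids, inputs_embeds) -> bool:
    # Mirrors the guard used in the forward methods above and below.
    return (input_ids is None) ^ (inputs_embeds is not None)


assert must_raise(None, None)          # neither given -> ValueError
assert must_raise("ids", "embeds")     # both given -> ValueError
assert not must_raise("ids", None)     # exactly one -> fine
assert not must_raise(None, "embeds")  # exactly one -> fine
```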
diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index 4dd958af0138..eb0a1ccb169b 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -389,7 +389,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 6c8f98a5cb57..4cdb93eb4675 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -342,7 +342,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/exaone_moe/modeling_exaone_moe.py b/src/transformers/models/exaone_moe/modeling_exaone_moe.py index 35754ab51566..82c7a392a3ed 100644 --- a/src/transformers/models/exaone_moe/modeling_exaone_moe.py +++ b/src/transformers/models/exaone_moe/modeling_exaone_moe.py @@ -512,7 +512,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 3bdd8427da30..0b477426b2c8 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1301,7 +1301,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1430,7 +1430,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 5b6e40194c4c..1e32b632ba65 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -1028,7 +1028,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if 
output_hidden_states is not None else self.config.output_hidden_states @@ -1141,7 +1141,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/fast_vlm/modeling_fast_vlm.py b/src/transformers/models/fast_vlm/modeling_fast_vlm.py index 25e84f43b957..16e918ccf828 100644 --- a/src/transformers/models/fast_vlm/modeling_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modeling_fast_vlm.py @@ -125,7 +125,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -184,7 +184,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -314,7 +314,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py index 39d9bbe1fc6e..44287a4b3a81 100644 --- a/src/transformers/models/fast_vlm/modular_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py @@ -191,7 +191,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -226,7 +226,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -299,7 +299,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. 
If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 264528def22e..d962a02bfe6a 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1007,7 +1007,7 @@ def get_text_features( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -1058,7 +1058,7 @@ def get_image_features( interpolate_pos_encoding: bool | None = None, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index 3bea6cbeebc4..18f5d9a3681b 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -555,9 +555,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -689,7 +687,7 @@ def set_input_embeddings(self, value): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -742,7 +740,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -879,7 +877,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index b2f7b7437eda..18a504e1ba67 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1406,9 +1406,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -1526,7 +1524,7 @@ def get_encoder(self, modality=None): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -1555,7 +1553,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1655,7 +1653,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 87d3df39752c..109e031e05b6 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -118,7 +118,7 @@ def gather_continuous_embeddings( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -269,7 +269,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | None = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*): Image patches to be used as continuous embeddings. 
The patches are flattened and then projected to the diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index d2099bb0edd7..708e00ac3314 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -846,7 +846,7 @@ def set_input_embeddings(self, value): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -892,7 +892,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1034,7 +1034,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 756356d85ea4..78956cce5ac8 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -810,7 +810,7 @@ def __init__(self, config: Gemma3Config): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -832,7 +832,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -917,7 +917,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index e22def7b0d87..73889e0cadc6 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1447,7 +1447,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -1956,7 +1956,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2169,7 +2169,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index a97cc2823c7b..b038e2005116 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1898,7 +1898,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -2228,7 +2228,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2441,7 +2441,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. 
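The recurring edit in these hunks, narrowing `-> tuple | SomeModelOutput` to plain `-> SomeModelOutput`, is an annotation cleanup rather than a behavior change: the tuple form of the output is produced at call time by the output-handling decorators such as `capture_outputs` and `can_return_tuple`, so the static signature only needs to name the `ModelOutput` subclass. A minimal self-contained sketch of that division of labor (toy code, not the actual transformers decorator):

```python
from dataclasses import dataclass

@dataclass
class ToyModelOutput:
    last_hidden_state: float
    pooler_output: float | None = None

    def to_tuple(self) -> tuple:
        # Mirror ModelOutput.to_tuple(): keep only the fields that are set.
        return tuple(v for v in (self.last_hidden_state, self.pooler_output) if v is not None)

def returns_tuple_on_demand(forward):
    # Hypothetical stand-in for the real decorators: the wrapper, not the
    # annotation, owns the tuple form of the output.
    def wrapper(*args, return_dict: bool = True, **kwargs):
        output = forward(*args, **kwargs)
        return output if return_dict else output.to_tuple()
    return wrapper

@returns_tuple_on_demand
def forward(x: float) -> ToyModelOutput:  # names only the dataclass, as in the hunks above
    return ToyModelOutput(last_hidden_state=2.0 * x, pooler_output=x)

print(forward(1.0))                     # ToyModelOutput(last_hidden_state=2.0, pooler_output=1.0)
print(forward(1.0, return_dict=False))  # (2.0, 1.0)
```

Callers that relied on the tuple form keep it via `return_dict=False` or `.to_tuple()`; only the advertised return type changes.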
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 15a0395a1962..5a1a973e1c2f 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -770,7 +770,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index 1f683a8bdb06..6eaecff538c1 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -482,7 +482,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4/modular_glm4.py b/src/transformers/models/glm4/modular_glm4.py index b9ec811fbc3d..f9feb89995e2 100644 --- a/src/transformers/models/glm4/modular_glm4.py +++ b/src/transformers/models/glm4/modular_glm4.py @@ -92,7 +92,7 @@ class Glm4ForCausalLM(GlmForCausalLM): def forward( self, **super_kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index f84278963268..93a412b0089e 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -279,7 +279,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -310,7 +310,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -427,7 +427,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VModelOutputWithPast: + ) -> Glm46VModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. 
@@ -579,7 +579,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VCausalLMOutputWithPast: + ) -> Glm46VCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index cc65db0ced9e..80277c435d6e 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -762,7 +762,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -854,7 +854,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1131,7 +1131,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1162,7 +1162,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1279,7 +1279,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1431,7 +1431,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 3e607eedfc75..a018dd20f09e 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -792,7 +792,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -874,7 +874,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -963,7 +963,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1163,7 +1163,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1240,7 +1240,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 4a012475a954..02b1eab3745e 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -828,7 +828,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. 
@@ -997,7 +997,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1301,7 +1301,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1332,7 +1332,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1449,7 +1449,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeModelOutputWithPast: + ) -> Glm4vMoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1656,7 +1656,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index 61bf605868a8..b375ed4af3c3 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -416,7 +416,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -520,7 +520,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/src/transformers/models/glm_image/modeling_glm_image.py b/src/transformers/models/glm_image/modeling_glm_image.py index 4639283fe8e5..2fec170db884 100644 --- a/src/transformers/models/glm_image/modeling_glm_image.py +++ b/src/transformers/models/glm_image/modeling_glm_image.py @@ -722,7 +722,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. @@ -871,7 +871,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1206,7 +1206,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1301,7 +1301,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glm_image/modular_glm_image.py b/src/transformers/models/glm_image/modular_glm_image.py index f9f207f1068a..6aef0bdc4bec 100644 --- a/src/transformers/models/glm_image/modular_glm_image.py +++ b/src/transformers/models/glm_image/modular_glm_image.py @@ -603,7 +603,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. 
@@ -871,7 +871,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -964,7 +964,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index 8e37722918d4..2648e351f45f 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -770,7 +770,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1047,7 +1047,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1078,7 +1078,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1195,7 +1195,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmOcrModelOutputWithPast: + ) -> GlmOcrModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1347,7 +1347,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmOcrCausalLMOutputWithPast: + ) -> GlmOcrCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index 09f65935d8b5..50b1d31722c8 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -313,7 +313,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @merge_with_config_defaults @capture_outputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -396,7 +396,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/glmasr/modular_glmasr.py b/src/transformers/models/glmasr/modular_glmasr.py index 35f0743b7d06..4cffc8d29fb5 100644 --- a/src/transformers/models/glmasr/modular_glmasr.py +++ b/src/transformers/models/glmasr/modular_glmasr.py @@ -327,7 +327,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @merge_with_config_defaults @capture_outputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -366,7 +366,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs) audio_hidden_states = audio_outputs.last_hidden_state audio_hidden_states = audio_hidden_states.reshape( diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index 646b565b9582..ced0e68331ac 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -439,7 +439,7 @@ def get_input_embeddings(self): @capture_outputs(tie_last_hidden_states=False) def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | GotOcr2VisionEncoderOutput: + ) -> GotOcr2VisionEncoderOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -558,7 +558,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -605,7 +605,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) 
-> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -703,7 +703,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index 2cd299fa4bc7..5a9cfed11476 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -313,7 +313,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -334,7 +334,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) -> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -398,7 +398,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 22c22541a8f3..6645d62e2dd5 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -415,7 +415,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`): `input_ids_length` = `sequence_length` if `past_key_values` is `None` else diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 66a3bfec122e..c43c2c8cf4a2 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -447,7 +447,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index e22c93c1fb4d..70588867941a 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -389,7 +389,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 43359ec98b7e..d7c7f1642deb 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -306,9 +306,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): hidden_states = layer(hidden_states, attention_dists=self.attention_dists) @@ -370,7 +368,7 @@ def get_output_embeddings(self): @auto_docstring def get_audio_features( self, input_features: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.encoder(input_features, return_dict=True, **kwargs) projected_embeds = self.projector(audio_outputs.last_hidden_state) audio_outputs.pooler_output = projected_embeds diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index e6d98911f362..14d1db14e4a7 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -667,7 +667,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 88c50171096e..2bbae8f7f5a4 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -249,7 +249,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 2e1625742cce..02cb525ada8e 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1298,7 +1298,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1481,7 +1481,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index 9b9bd65bf9b0..c0990de9661a 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -223,7 +223,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 91f6a4ed5158..e451f780a91a 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -736,7 +736,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 7d825656569e..b05fe7f7f141 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1222,7 +1222,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1255,7 +1255,7 @@ def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py index f760bc611f80..f1a06f4221a9 100644 --- a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py @@ -675,7 +675,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py index d7bbce5248b2..646693131039 100644 --- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py @@ -557,7 +557,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index b730b98acbe4..0dff83fca02a 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -957,7 +957,7 @@ def forward( interpolate_pos_encoding: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsBaseModelOutputWithPast: + ) -> IdeficsBaseModelOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. @@ -1145,7 +1145,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsCausalLMOutputWithPast: + ) -> IdeficsCausalLMOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. 
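Because every file above follows the same mechanical rewrite, the pattern can be policed by the new `utils/check_decorator_return_types.py` gate wired into CI and the Makefile. The script itself is not part of this excerpt; as a rough illustration of what a checker with a `--fix_and_overwrite` mode involves, a regex-based sketch might look like this (hypothetical, not the real implementation):

```python
import re
import sys
from pathlib import Path

# Drop the redundant `tuple |` half of a return annotation, e.g.
# `) -> tuple | CausalLMOutputWithPast:` becomes `) -> CausalLMOutputWithPast:`.
# (A real check would presumably also handle forms like `tuple[...] | X`.)
PATTERN = re.compile(r"->\s*tuple\s*\|\s*(\w+)\s*:")

def check_file(path: Path, overwrite: bool) -> bool:
    source = path.read_text()
    fixed = PATTERN.sub(r"-> \1:", source)
    if fixed == source:
        return True
    if overwrite:
        path.write_text(fixed)
        return True
    print(f"{path}: `tuple | ...` return annotation found; run with --fix_and_overwrite")
    return False

if __name__ == "__main__":
    overwrite = "--fix_and_overwrite" in sys.argv
    results = [check_file(p, overwrite) for p in Path("src/transformers/models").rglob("*.py")]
    sys.exit(0 if all(results) else 1)
```

The real check is likely finer-grained (restricted to decorated `forward` methods, for instance), but the check/fix split mirrors the two modes invoked in the workflow and Makefile hunks.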
diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 2cc51ac34993..1352519de55a 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -356,7 +356,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 92c88a1cbd1b..8d3e75dd0b66 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -470,7 +470,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" patch_attention_mask (`torch.BoolTensor` of shape `(batch_size, num_patches_height, num_patches_width)`, *optional*): The attention mask for the patches. @@ -833,7 +833,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -905,7 +905,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics2BaseModelOutputWithPast: + ) -> Idefics2BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -1034,7 +1034,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics2CausalLMOutputWithPast: + ) -> Idefics2CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 0925222c416d..81f7346b0872 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -478,7 +478,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: batch_size = pixel_values.size(0) if patch_attention_mask is None: patch_size = self.patch_size @@ -579,7 +579,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -652,7 +652,7 @@ def forward( cache_position: torch.LongTensor | None = None, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics3BaseModelOutputWithPast: + ) -> Idefics3BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -792,7 +792,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics3CausalLMOutputWithPast: + ) -> Idefics3CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 29f32f17d6c4..517ccd608a2a 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -415,7 +415,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -714,7 +714,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -874,7 +874,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1015,7 +1015,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1187,7 +1187,7 @@ def get_image_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1275,7 +1275,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. 
Input tokens can optionally be provided diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index b6462eda4cf0..fd0093135daf 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -420,7 +420,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -714,7 +714,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -824,7 +824,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1004,7 +1004,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1243,7 +1243,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. @@ -1452,7 +1452,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
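The narrowed annotations also pay off for static analysis: with `tuple` in the union, every attribute access on a returned output forced an `isinstance` narrowing step, even though the `ModelOutput` path is the default. A runnable toy comparison (the function names here are illustrative, not transformers APIs):

```python
from dataclasses import dataclass

@dataclass
class Pooled:
    last_hidden_state: list[float]
    pooler_output: list[float]

def forward_old(x: list[float]) -> tuple | Pooled:  # pre-PR style annotation
    return Pooled(x, x)

def forward_new(x: list[float]) -> Pooled:          # post-PR style annotation
    return Pooled(x, x)

# Under the old union, mypy/pyright reject direct attribute access
# (`tuple` has no attribute "pooler_output"), so callers had to narrow:
old = forward_old([1.0, 2.0])
if isinstance(old, Pooled):
    print(old.pooler_output)

# Under the new annotation the access type-checks as written:
new = forward_new([1.0, 2.0])
print(new.pooler_output)
```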
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 8b76bfc6fba1..6e0a58f12439 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -192,7 +192,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, @@ -294,7 +294,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -393,7 +393,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 8a5158d2e78e..532513ee516e 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -448,7 +448,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -556,7 +556,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. 
@@ -634,7 +634,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -798,7 +798,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLCausalLMOutputWithPast: + ) -> InternVLCausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index ecaf48604395..63a215e797ae 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -402,7 +402,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -496,7 +496,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -550,7 +550,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index a2a10d8b00f3..112742c3fc10 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -460,7 +460,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1003,7 +1003,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state) diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 25ac255a6466..afac4b3e711d 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -566,7 +566,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> 
tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -942,7 +942,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state) diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index ad231d3cd2f1..903e8466b76d 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -448,7 +448,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1256,7 +1256,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1331,7 +1331,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1506,7 +1506,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithProjectionAttentions: + ) -> BaseModelOutputWithProjectionAttentions: if "return_attentions" in kwargs: warnings.warn( "`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict`" @@ -1549,7 +1549,7 @@ def forward( interpolate_pos_encoding: bool = False, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Kosmos2ModelOutput: + ) -> Kosmos2ModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0, @@ -1687,7 +1687,7 @@ def forward( output_hidden_states: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Kosmos2ForConditionalGenerationModelOutput: + ) -> Kosmos2ForConditionalGenerationModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features . 
Mask values selected in `[0, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index d69359c1d4d9..76fb26f1e640 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -326,7 +326,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -466,7 +466,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -602,7 +602,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -719,7 +719,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -854,7 +854,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -968,7 +968,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py index f3ab420c64d9..bc438a11fa20 100755 --- a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -171,7 +171,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -250,7 +250,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. 
@@ -361,7 +361,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py index 319f685d46d3..c8ef01fc3081 100644 --- a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py @@ -102,7 +102,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -181,7 +181,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. @@ -271,7 +271,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py index e5893fe35301..bd812009781d 100644 --- a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py @@ -174,7 +174,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -228,7 +228,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -352,7 +352,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrCausalLMOutputWithPast: + ) -> LightOnOcrCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py index efeb35b4af8b..8a00c6134153 100644 --- a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py @@ -313,7 +313,7 @@ def __init__(self, config: LightOnOcrConfig): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -343,7 +343,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index e767ac10b10d..582ed8d0515f 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -535,7 +535,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -625,7 +625,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1219,7 +1219,7 @@ def get_image_features( pixel_values: torch.FloatTensor, vision_feature_select_strategy: str, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -1273,7 +1273,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Llama4CausalLMOutputWithPast: + ) -> Llama4CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index a52aaa1cda51..3289c1f4c88a 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -158,7 +158,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -237,7 +237,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaModelOutputWithPast: + ) -> LlavaModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -337,7 +337,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaCausalLMOutputWithPast: + ) -> LlavaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index ad3d673015ab..fd885b03649c 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -357,7 +357,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -461,7 +461,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextModelOutputWithPast: + ) -> LlavaNextModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -604,7 +604,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextCausalLMOutputWithPast: + ) -> LlavaNextCausalLMOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. 
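These forward signatures can promise a concrete output class because the tuple conversion no longer happens inside the method body: a decorator applied at the forward boundary (for example, `@capture_outputs` appears on the Ovis2 vision transformer later in this diff) can downgrade the typed output when a caller asks for the legacy tuple. How transformers actually wires this is not visible in these hunks, so treat the following as an assumption about the pattern rather than the library's implementation; `tupleable_forward` is a made-up name.

# Assumption-laden sketch: the wrapper, not forward() itself, converts the
# typed output back to a tuple when return_dict resolves to False. Made-up
# decorator name; NOT the transformers decorator.
import functools


def tupleable_forward(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, return_dict: bool | None = None, **kwargs):
        output = forward(self, *args, **kwargs)  # body always builds the typed output
        use_dict = return_dict if return_dict is not None else self.config.use_return_dict
        return output if use_dict else output.to_tuple()

    return wrapper

With the conversion pulled out this way, `-> LlavaModelOutputWithPast` (and friends) accurately describes what the body constructs, while `return_dict=False` callers are unaffected at runtime.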
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 3d4e1a531418..bf35cedc6ec7 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -409,7 +409,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -530,7 +530,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -615,7 +615,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -746,7 +746,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 291c8c03c6ef..0aaa0a7e6418 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -289,7 +289,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -361,7 +361,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. 
@@ -462,7 +462,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -579,7 +579,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index c86c7bc260f6..7e26a1196143 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -368,7 +368,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -494,7 +494,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -585,7 +585,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -731,7 +731,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. 
diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index f1da402f72bb..63aed8fad86e 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -325,7 +325,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -399,7 +399,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -461,7 +461,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -567,7 +567,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. 
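The practical payoff of the narrowing is for static analysis: under the old `tuple | LlavaOnevisionModelOutputWithPast` union, every attribute access on the result needed an isinstance guard to satisfy a type checker, whereas the concrete class checks as written. A self-contained illustration with stub types (not real transformers signatures):

# Stub types only -- shows what the union removal buys mypy/pyright.
import torch


class StubOutput:
    def __init__(self, last_hidden_state: torch.Tensor) -> None:
        self.last_hidden_state = last_hidden_state


def forward_old(x: torch.Tensor) -> tuple | StubOutput:
    return StubOutput(x)


def forward_new(x: torch.Tensor) -> StubOutput:
    return StubOutput(x)


x = torch.zeros(1, 4)
# forward_old(x).last_hidden_state   # checker error: `tuple` has no attribute
#                                    # `last_hidden_state`; needs isinstance()
h = forward_new(x).last_hidden_state  # checks and runs cleanly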
diff --git a/src/transformers/models/lw_detr/modeling_lw_detr.py b/src/transformers/models/lw_detr/modeling_lw_detr.py index 08aeb6bcea67..44f9ce107794 100644 --- a/src/transformers/models/lw_detr/modeling_lw_detr.py +++ b/src/transformers/models/lw_detr/modeling_lw_detr.py @@ -1133,7 +1133,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/lw_detr/modular_lw_detr.py b/src/transformers/models/lw_detr/modular_lw_detr.py index d783f8adcacd..d2b0a96088e1 100644 --- a/src/transformers/models/lw_detr/modular_lw_detr.py +++ b/src/transformers/models/lw_detr/modular_lw_detr.py @@ -1109,7 +1109,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 7101bb8685d4..8f1eafa11799 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -474,7 +474,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -558,7 +558,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -666,7 +666,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -782,7 +782,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -885,7 +885,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. 
diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index d79df8c0cd40..04ae01fae9b5 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -811,7 +811,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -843,7 +843,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index ea01a965d858..15944857b61f 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -559,7 +559,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -587,7 +587,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index 028068a78a77..a75920796aed 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -655,7 +655,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 15de27f09dd3..09cff1d0bec3 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -547,7 +547,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index a48af97eb1dc..9707eaf7ec3f 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -227,7 +227,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) 
will save all the hidden states. image_outputs = self.vision_tower( @@ -297,7 +297,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -408,7 +408,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/mistral3/modular_mistral3.py b/src/transformers/models/mistral3/modular_mistral3.py index d0c3975ba9de..63d4d631dd49 100644 --- a/src/transformers/models/mistral3/modular_mistral3.py +++ b/src/transformers/models/mistral3/modular_mistral3.py @@ -132,7 +132,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -178,7 +178,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -259,7 +259,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 8d0ec26c23cf..a1f90104154d 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -479,7 +479,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -535,7 +535,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index b8cc9c4ba6af..4e2442b4e024 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -399,7 +399,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -435,7 +435,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 8ebfd9f2bf85..01281cd3fdb6 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1241,7 +1241,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. This tensor contains the processed image features that @@ -1498,7 +1498,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" aspect_ratio_mask (`torch.Tensor` of shape `(batch_size, max_num_images, max_num_tiles)`, *optional*): Mask to avoid performing attention on padding tiles. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index cc9f7ee4eecb..b99ac7696098 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -623,7 +623,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -701,7 +701,7 @@ def forward( labels: torch.LongTensor | None = None, next_sentence_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MobileBertForPreTrainingOutput: + ) -> MobileBertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -799,7 +799,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -868,7 +868,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -955,7 +955,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1033,7 +1033,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.mobilebert( input_ids, attention_mask=attention_mask, @@ -1104,7 +1104,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1204,7 +1204,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
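One reason a sweep like this is worth doing mechanically: the old annotations were not even spelled consistently — plain `tuple | ...` in most hunks, `tuple[torch.Tensor] | ...` in the MarkupLM hunks above, `tuple[torch.Tensor, ...] | ...` in the ModernBERT-decoder hunks below — and a few methods (the LwDetr decoder, the NLLB-MoE encoder, the Ovis2 vision transformer) carried no return annotation at all. A purely hypothetical sketch of how such annotations could be detected; this is an illustration that the old spellings are mechanically findable, not a transformers utility.

# Hypothetical lint sketch: walk a module's AST and flag return annotations
# that start with a tuple form, as the unions replaced above do. Illustrative
# only; AsyncFunctionDef and edge cases deliberately ignored.
import ast


def flag_tuple_returns(source: str, filename: str = "<memory>") -> list[str]:
    flagged = []
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.FunctionDef) and node.returns is not None:
            annotation = ast.unparse(node.returns)
            if annotation.startswith("tuple |") or annotation.startswith("tuple["):
                flagged.append(f"{filename}:{node.lineno}: {node.name} -> {annotation}")
    return flagged


sample = "def forward(self) -> tuple | BaseModelOutputWithPast: ...\n"
print(flag_tuple_returns(sample))
# ['<memory>:1: forward -> tuple | BaseModelOutputWithPast']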
diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py index e5b53c008c31..e68ee9de7b0d 100644 --- a/src/transformers/models/modernbert/modeling_modernbert.py +++ b/src/transformers/models/modernbert/modeling_modernbert.py @@ -539,7 +539,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -602,7 +602,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -692,7 +692,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -746,7 +746,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.model( input_ids, attention_mask=attention_mask, @@ -804,7 +804,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 82f2cf5dfada..e5f76b08f7b5 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -694,7 +694,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -757,7 +757,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -847,7 +847,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. @@ -901,7 +901,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.model( input_ids, attention_mask=attention_mask, @@ -959,7 +959,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index 86e7b628ec33..9f7020ccd357 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -477,7 +477,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -582,7 +582,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -685,7 +685,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 37ec89a4c864..77820df2f41b 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -532,7 +532,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -637,7 +637,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., @@ -740,7 +740,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 4345d3634da8..039cf9c64dd1 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -564,7 +564,7 @@ def forward( input_values: torch.FloatTensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): @@ -658,7 +658,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index fe237200ffc1..e5e3c93ff11d 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -530,7 +530,7 @@ def forward( input_values: torch.FloatTensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): @@ -614,7 +614,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention diff --git a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py index a58780b73252..123af47a918b 100644 --- a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py @@ -819,7 +819,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention diff --git a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py index 2f1f6058316a..3f68f2622766 100644 --- a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py @@ -363,7 +363,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 313b574518f4..c13571a89c84 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -718,7 +718,7 @@ def forward( attention_mask: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> MoEModelOutput: if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -808,7 +808,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -916,7 +916,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | Seq2SeqMoEModelOutput: + ) -> Seq2SeqMoEModelOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, @@ -1088,7 +1088,7 @@ def forward( output_router_logits: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | Seq2SeqMoEOutput: + ) -> Seq2SeqMoEOutput: output_router_logits = ( output_router_logits if output_router_logits is not None else self.config.output_router_logits ) diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 319d80532b94..f9501455cb34 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -354,7 +354,7 @@ def forward( position_ids: torch.LongTensor | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -539,7 +539,7 @@ def forward( position_ids: torch.LongTensor | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -606,7 +606,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: 
Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modeling_ovis2.py b/src/transformers/models/ovis2/modeling_ovis2.py
index e9967d9e1e82..dab8e2f99e15 100644
--- a/src/transformers/models/ovis2/modeling_ovis2.py
+++ b/src/transformers/models/ovis2/modeling_ovis2.py
@@ -344,7 +344,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -423,7 +423,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @capture_outputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -495,7 +495,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -561,7 +561,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -668,7 +668,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modular_ovis2.py b/src/transformers/models/ovis2/modular_ovis2.py
index 6990fc50301a..bc2906329fe4 100644
--- a/src/transformers/models/ovis2/modular_ovis2.py
+++ b/src/transformers/models/ovis2/modular_ovis2.py
@@ -137,7 +137,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -206,7 +206,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @capture_outputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -267,7 +267,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -309,7 +309,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -404,7 +404,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index 3c18262188ce..5c94cdb56cfc 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -975,7 +975,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1014,7 +1014,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 78e45a3fe6a3..e502ca4ab404 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -956,7 +956,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -995,7 +995,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
index 4720dd398b3c..cd9ca8cd61d7 100644
--- a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
@@ -1228,7 +1228,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1340,7 +1340,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1443,7 +1443,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
index dc5bb2df1dcb..da7abd350da6 100644
--- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -1158,7 +1158,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1227,7 +1227,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1304,7 +1304,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index 15f2071ee2bc..aaf8a27f6d29 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -266,7 +266,7 @@ def set_input_embeddings(self, value):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_features = self.multi_modal_projector(selected_image_feature)
@@ -317,7 +317,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | PaligemmaModelOutputWithPast:
+    ) -> PaligemmaModelOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -471,7 +471,7 @@ def forward(
         return_dict: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaliGemmaCausalLMOutputWithPast:
+    ) -> PaliGemmaCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/pe_audio/modeling_pe_audio.py b/src/transformers/models/pe_audio/modeling_pe_audio.py
index 5f58130e2146..2e271da71625 100644
--- a/src/transformers/models/pe_audio/modeling_pe_audio.py
+++ b/src/transformers/models/pe_audio/modeling_pe_audio.py
@@ -643,7 +643,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)

         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
diff --git a/src/transformers/models/pe_audio/modular_pe_audio.py b/src/transformers/models/pe_audio/modular_pe_audio.py
index 8233367a3e37..b8b216489b1f 100644
--- a/src/transformers/models/pe_audio/modular_pe_audio.py
+++ b/src/transformers/models/pe_audio/modular_pe_audio.py
@@ -119,7 +119,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)

         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
diff --git a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
index a76e1b40f27c..01ce3f287555 100644
--- a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
@@ -592,7 +592,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
index c8d603a3e6eb..26809df7d2c4 100644
--- a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
@@ -382,7 +382,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_video/modeling_pe_video.py b/src/transformers/models/pe_video/modeling_pe_video.py
index fbc32ed0983d..bfecc99cd7fc 100644
--- a/src/transformers/models/pe_video/modeling_pe_video.py
+++ b/src/transformers/models/pe_video/modeling_pe_video.py
@@ -527,7 +527,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)

         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
@@ -582,7 +582,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -599,7 +599,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/pe_video/modular_pe_video.py b/src/transformers/models/pe_video/modular_pe_video.py
index 2c7764e6fe25..e699da3c6f11 100644
--- a/src/transformers/models/pe_video/modular_pe_video.py
+++ b/src/transformers/models/pe_video/modular_pe_video.py
@@ -109,7 +109,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)

         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
@@ -164,7 +164,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -181,7 +181,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index 7957d3bbd143..fe656acc5fde 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -187,7 +187,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -255,7 +255,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -349,7 +349,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py
index e2d2df573720..704290077d6d 100644
--- a/src/transformers/models/perception_lm/modular_perception_lm.py
+++ b/src/transformers/models/perception_lm/modular_perception_lm.py
@@ -155,7 +155,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -223,7 +223,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -336,7 +336,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
index a62f70d5ea0f..d428b67da295 100644
--- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
@@ -416,7 +416,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1531,7 +1531,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index 4e96909c8513..f950a5ba0614 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -690,7 +690,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1502,7 +1502,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
index 7558dafca6d0..d4a70c090786 100644
--- a/src/transformers/models/pixtral/modeling_pixtral.py
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -487,7 +487,7 @@ def forward(
         return_dict: bool | None = None,
         *args,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         if image_sizes is None:
             batch_size, _, height, width = pixel_values.shape
             image_sizes = [(height, width)] * batch_size
diff --git a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py
index d4c275b93eed..17e8e6f34a0b 100644
--- a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py
+++ b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py
@@ -1150,7 +1150,7 @@ def forward(
         norm=None,
         mask_feat=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> PPDocLayoutV3DecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1655,7 +1655,7 @@ def forward(
         encoder_outputs: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput:
+    ) -> PPDocLayoutV3ModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1977,7 +1977,7 @@ def forward(
         encoder_outputs: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput:
+    ) -> PPDocLayoutV3ForObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py
index a4b2536c4096..4b3e3533b6ea 100644
--- a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py
+++ b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py
@@ -975,7 +975,7 @@ def forward(
         norm=None,
         mask_feat=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> PPDocLayoutV3DecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1113,7 +1113,7 @@ def forward(
         encoder_outputs: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput:
+    ) -> PPDocLayoutV3ModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1432,7 +1432,7 @@ def forward(
         encoder_outputs: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput:
+    ) -> PPDocLayoutV3ForObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index e48498984637..d12d6af5431f 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -790,7 +790,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch
     @merge_with_config_defaults
     @capture_outputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1223,7 +1225,7 @@ def get_window_index(self, grid_thw):
     @capture_outputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1775,7 +1777,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1792,7 +1794,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1810,7 +1812,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
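Note on the pattern running through all of these hunks: decorators such as `capture_outputs` and `can_return_tuple` own the tuple conversion, so the decorated `forward` body always returns the typed `ModelOutput`; the `tuple |` half of the old annotations described the decorator's behavior, not the method's. A minimal sketch of that mechanism (toy names throughout, not the transformers implementation):

```python
# Hypothetical sketch: a `can_return_tuple`-style decorator. The wrapped method
# is annotated with (and returns) only the dataclass; the decorator downgrades
# it to a plain tuple when the caller passes return_dict=False.
import functools
from dataclasses import dataclass, fields


@dataclass
class ToyOutput:
    last_hidden_state: float
    pooler_output: float | None = None


def can_return_tuple_sketch(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, return_dict: bool = True, **kwargs):
        output = forward(self, *args, **kwargs)  # always a ToyOutput here
        if not return_dict:
            # Drop None fields and hand back the legacy tuple interface.
            return tuple(
                getattr(output, f.name) for f in fields(output) if getattr(output, f.name) is not None
            )
        return output

    return wrapper


class ToyModel:
    @can_return_tuple_sketch
    def forward(self) -> ToyOutput:  # annotation names only the dataclass
        return ToyOutput(last_hidden_state=1.0)


model = ToyModel()
assert isinstance(model.forward(), ToyOutput)
assert model.forward(return_dict=False) == (1.0,)
```

Under this reading, annotating `forward` as `tuple | ToyOutput` would mistype the undecorated method, which is exactly the redundancy this patch removes.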
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 027189f919ad..1999b68a37fc 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -1681,7 +1681,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch
     @merge_with_config_defaults
     @capture_outputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1921,7 +1923,7 @@ def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) ->
     @capture_outputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -2069,7 +2071,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -2086,7 +2088,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -2104,7 +2106,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 6305902014ce..83a68af48eac 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -447,7 +447,7 @@ def get_window_index(self, grid_thw):
     @capture_outputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1192,7 +1192,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1214,7 +1214,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1515,7 +1515,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 093b4f2b783a..e2488f8962f0 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -294,7 +294,7 @@ def get_window_index(self, grid_thw):
     @capture_outputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -651,7 +651,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 3a403942fc18..f14ea866f1d8 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1158,7 +1158,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1180,7 +1180,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1443,7 +1443,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2VLCausalLMOutputWithPast:
+    ) -> Qwen2VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
index 2a49fcb82d67..4b16dbac2142 100644
--- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py
@@ -1591,7 +1591,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1608,7 +1608,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1726,7 +1726,7 @@ def forward(
         mm_token_type_ids: torch.IntTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3_5ModelOutputWithPast:
+    ) -> Qwen3_5ModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1969,7 +1969,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3_5CausalLMOutputWithPast:
+    ) -> Qwen3_5CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py
index cae6a0a7d383..a0715dedf1c9 100644
--- a/src/transformers/models/qwen3_5/modular_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py
@@ -697,7 +697,7 @@ class Qwen3_5Model(Qwen3VLModel):
     def get_video_features(
         self,
         **super_kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         # Same implementation as for images
         return super().get_video_features(**super_kwargs)

@@ -706,7 +706,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         pixel_values = pixel_values.type(self.visual.dtype)
         vision_output: BaseModelOutputWithPooling = self.visual(
             pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs
@@ -734,7 +734,7 @@ def forward(
         mm_token_type_ids: torch.IntTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3_5ModelOutputWithPast:
+    ) -> Qwen3_5ModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
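The new `utils/check_decorator_return_types.py` hook wired into CI, the Makefile, and the consistency bot earlier in this patch is what keeps these annotations from regressing. Its actual implementation is not shown here; an illustrative fixer in the same spirit, with a hypothetical regex and file walk, could look like this:

```python
# Illustrative sketch only; NOT the actual utils/check_decorator_return_types.py.
# It rewrites `-> tuple | X:` (or `-> tuple[torch.Tensor] | X:`) return
# annotations down to `-> X:`, and fails in check mode if anything would change.
import re
import sys
from pathlib import Path

# Matches a return annotation whose union starts with a (possibly subscripted)
# `tuple`, capturing the ModelOutput class name that follows.
RETURN_UNION = re.compile(r"->\s*tuple(?:\[[^\]]*\])?\s*\|\s*([A-Za-z_][A-Za-z0-9_]*):")


def fix_file(path: Path, overwrite: bool) -> bool:
    source = path.read_text()
    fixed = RETURN_UNION.sub(r"-> \1:", source)
    if fixed != source and overwrite:
        path.write_text(fixed)
    return fixed != source


if __name__ == "__main__":
    overwrite = "--fix_and_overwrite" in sys.argv
    changed = [
        path
        for pattern in ("modeling_*.py", "modular_*.py")
        for path in Path("src/transformers/models").rglob(pattern)
        if fix_file(path, overwrite)
    ]
    if changed and not overwrite:
        raise SystemExit(f"{len(changed)} files have redundant `tuple |` return types; run with --fix_and_overwrite")
```

A regex pass like this is only a sketch; a robust checker would more plausibly parse the AST and restrict itself to methods carrying the output-capturing decorators, which is what the diffs above consistently touch.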
diff --git a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py index 87f52dc8b651..1f30f6df0a15 100644 --- a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py @@ -1716,7 +1716,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1733,7 +1733,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1851,7 +1851,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeModelOutputWithPast: + ) -> Qwen3_5MoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -2171,7 +2171,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeCausalLMOutputWithPast: + ) -> Qwen3_5MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 4a4d8a5029be..1ddd10966759 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -704,7 +704,7 @@ def forward( feature_lens=None, aftercnn_lens=None, **kwargs, - ): + ) -> BaseModelOutputWithPooling: r""" feature_lens (`torch.LongTensor` of shape `(batch_size,)`): mel length @@ -1192,7 +1192,7 @@ def fast_pos_embed_interpolate(self, grid_thw): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: """ Args: hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): @@ -1710,7 +1710,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. 
@@ -1941,7 +1941,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1958,7 +1958,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1976,7 +1976,7 @@ def get_audio_features( feature_attention_mask: torch.LongTensor | None = None, audio_feature_lengths: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): The tensors corresponding to the input audios. @@ -2075,7 +2075,7 @@ def forward( cache_position=None, video_second_per_grid=None, **kwargs, - ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast: + ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -2971,7 +2971,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index ff45007a2696..5dbe7005bdaf 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -1465,7 +1465,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1482,7 +1482,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1500,7 +1500,7 @@ def get_audio_features( feature_attention_mask: torch.LongTensor | None = None, audio_feature_lengths: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): The tensors corresponding to the input audios. 
@@ -1549,7 +1549,7 @@ def forward( cache_position=None, video_second_per_grid=None, **kwargs, - ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast: + ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast: output_router_logits = ( output_router_logits if output_router_logits is not None else self.config.text_config.output_router_logits ) diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 8e5ab18be6a2..d761fb832674 100644 --- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -764,7 +764,7 @@ def fast_pos_embed_interpolate(self, grid_thw): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: """ Args: hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): @@ -864,7 +864,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. @@ -1145,7 +1145,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1162,7 +1162,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1280,7 +1280,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLModelOutputWithPast: + ) -> Qwen3VLModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1471,7 +1471,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLCausalLMOutputWithPast: + ) -> Qwen3VLCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index e791940ea965..6f5c4ca110f4 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -662,7 +662,7 @@ def fast_pos_embed_interpolate(self, grid_thw): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: """ Args: hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): @@ -760,7 +760,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. @@ -859,7 +859,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -884,7 +884,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -910,7 +910,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLModelOutputWithPast: + ) -> Qwen3VLModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1036,7 +1036,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLCausalLMOutputWithPast: + ) -> Qwen3VLCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index b3d8e371f84a..9fc9658fefce 100644 --- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -748,7 +748,7 @@ def fast_pos_embed_interpolate(self, grid_thw): @capture_outputs def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: """ Args: hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): @@ -937,7 +937,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. @@ -1274,7 +1274,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1291,7 +1291,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithDeepstackFeatures: + ) -> BaseModelOutputWithDeepstackFeatures: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1409,7 +1409,7 @@ def forward( mm_token_type_ids: torch.IntTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLMoeModelOutputWithPast: + ) -> Qwen3VLMoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1653,7 +1653,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast: + ) -> Qwen3VLMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 6033e30bbfb6..701bf3c6ca0b 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -355,7 +355,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. 
@@ -462,7 +462,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast: + ) -> Qwen3VLMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index f689dd63cc74..fd28fd050064 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -607,7 +607,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -752,7 +752,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -862,7 +862,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -958,7 +958,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1042,7 +1042,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1143,7 +1143,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`:
@@ -1233,7 +1233,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta/modular_roberta.py b/src/transformers/models/roberta/modular_roberta.py
index ef1641fbccd7..84761658b9b0 100644
--- a/src/transformers/models/roberta/modular_roberta.py
+++ b/src/transformers/models/roberta/modular_roberta.py
@@ -225,7 +225,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -335,7 +335,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -431,7 +431,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -515,7 +515,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -616,7 +616,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -706,7 +706,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
index 9cbfa8b26292..1b4f7b2c9b71 100644
--- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
+++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
@@ -620,7 +620,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -782,7 +782,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -898,7 +898,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -996,7 +996,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1081,7 +1081,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1183,7 +1183,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1275,7 +1275,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py
index d3111ebfdba4..5ead02ad33d1 100644
--- a/src/transformers/models/roc_bert/modeling_roc_bert.py
+++ b/src/transformers/models/roc_bert/modeling_roc_bert.py
@@ -693,7 +693,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -859,7 +859,7 @@ def forward(
         labels_attention_mask: torch.Tensor | None = None,
         labels_token_type_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1055,7 +1055,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1179,7 +1179,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1316,7 +1316,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1414,7 +1414,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1538,7 +1538,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1614,7 +1614,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
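Every hunk above makes the same mechanical change: the `tuple | ...` half of the return annotation is dropped so `forward` is annotated with the `ModelOutput` subclass it actually constructs. The tuple form is produced outside the annotated function, by wrappers such as `@can_return_tuple` (visible on the VibeVoice methods later in this patch). A minimal, self-contained sketch of that split — `ToyOutput`, `ToyModel`, and this simplified `can_return_tuple` are all invented here for illustration and are not the transformers implementation:

```python
# Sketch of why the narrowed annotation is still accurate: the wrapper, not
# the annotated forward, is what produces a tuple for legacy callers.
import functools
from dataclasses import dataclass, fields


@dataclass
class ToyOutput:  # stand-in for a ModelOutput subclass
    logits: list
    hidden_states: list | None = None


def can_return_tuple(func):  # hypothetical stand-in for the real decorator
    @functools.wraps(func)
    def wrapper(*args, return_dict: bool = True, **kwargs):
        output = func(*args, **kwargs)  # always a ToyOutput here
        if return_dict:
            return output
        # legacy `return_dict=False` contract: drop None fields, return a tuple
        return tuple(
            getattr(output, f.name) for f in fields(output) if getattr(output, f.name) is not None
        )

    return wrapper


class ToyModel:
    @can_return_tuple
    def forward(self, x) -> ToyOutput:  # annotation names the dataclass only
        return ToyOutput(logits=[x])


model = ToyModel()
assert isinstance(model.forward(1), ToyOutput)
assert model.forward(1, return_dict=False) == ([1],)
```

Under this reading, the old `tuple[torch.Tensor] | ...` annotations described the decorated call site, not the function body, which is why deleting them does not change runtime behavior.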
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index 182d4b2c054a..abd8f0bf463d 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -1165,7 +1165,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1490,7 +1490,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1715,7 +1715,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py
index f9289f9e6619..7d9cffa1b6b2 100644
--- a/src/transformers/models/rt_detr/modular_rt_detr.py
+++ b/src/transformers/models/rt_detr/modular_rt_detr.py
@@ -1274,7 +1274,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1476,7 +1476,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1701,7 +1701,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
index b5244ffda7f8..13fd3c87dbf1 100644
--- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
+++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
@@ -588,7 +588,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrV2DecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1413,7 +1413,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ModelOutput:
+    ) -> RTDetrV2ModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1747,7 +1747,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ObjectDetectionOutput:
+    ) -> RTDetrV2ObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py
index 7638a2a8f8c0..00f5d83cecc3 100644
--- a/src/transformers/models/sam/modeling_sam.py
+++ b/src/transformers/models/sam/modeling_sam.py
@@ -1057,7 +1057,7 @@ def get_input_embeddings(self):

     @capture_outputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamVisionEncoderOutput:
+    ) -> SamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
diff --git a/src/transformers/models/sam2/modeling_sam2.py b/src/transformers/models/sam2/modeling_sam2.py
index cf598e4c49d9..477e84a28d45 100644
--- a/src/transformers/models/sam2/modeling_sam2.py
+++ b/src/transformers/models/sam2/modeling_sam2.py
@@ -634,7 +634,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -686,7 +686,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -1577,7 +1577,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
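The RT-DETR hunks above differ slightly from the rest of the patch: the decoder `forward` previously carried no return annotation at all, and one is added (`-> RTDetrDecoderOutput` / `-> RTDetrV2DecoderOutput`). Annotations like this are easy to enforce mechanically. The sketch below is a hypothetical illustration of such a check using Python's `ast` module; it is not the repository's actual consistency script, and `auto_docstring` in the sample source is only an inert decorator for the example:

```python
# Hypothetical sketch: detect decorated function defs with no return annotation.
import ast

SOURCE = '''
class Decoder:
    @auto_docstring
    def forward(self, inputs_embeds=None):   # lacks "-> RTDetrDecoderOutput"
        ...
'''


def missing_return_annotations(source: str) -> list[str]:
    """Names of decorated function defs that have no return annotation."""
    return [
        node.name
        for node in ast.walk(ast.parse(source))
        if isinstance(node, ast.FunctionDef) and node.decorator_list and node.returns is None
    ]


print(missing_return_annotations(SOURCE))  # ['forward']
```

A fixer variant of the same pass could rewrite the offending signature in place, which is how a `--fix_and_overwrite` style of flag is typically wired up.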
diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py
index 5ecdbfa9fafa..2c821c5d3f1c 100644
--- a/src/transformers/models/sam2/modular_sam2.py
+++ b/src/transformers/models/sam2/modular_sam2.py
@@ -745,7 +745,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -797,7 +797,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -1257,7 +1257,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam2_video/modeling_sam2_video.py b/src/transformers/models/sam2_video/modeling_sam2_video.py
index 0f20ca5c75dc..ed937bd46fb4 100644
--- a/src/transformers/models/sam2_video/modeling_sam2_video.py
+++ b/src/transformers/models/sam2_video/modeling_sam2_video.py
@@ -1841,7 +1841,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VideoVisionEncoderOutput:
+    ) -> Sam2VideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3/modeling_sam3.py b/src/transformers/models/sam3/modeling_sam3.py
index daad1e4fd517..d9b571a5b40c 100644
--- a/src/transformers/models/sam3/modeling_sam3.py
+++ b/src/transformers/models/sam3/modeling_sam3.py
@@ -1037,7 +1037,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3VisionEncoderOutput:
+    ) -> Sam3VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
@@ -1408,7 +1408,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_sizes: list[tuple[int, int]] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETREncoderOutput:
+    ) -> Sam3DETREncoderOutput:
         """
         Forward pass for the DETR encoder.

@@ -1704,7 +1704,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_shapes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETRDecoderOutput:
+    ) -> Sam3DETRDecoderOutput:
         """
         Forward pass for the DETR decoder.

@@ -2021,7 +2021,7 @@ def forward(
         prompt_features: torch.Tensor | None = None,
         prompt_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3MaskDecoderOutput:
+    ) -> Sam3MaskDecoderOutput:
         """
         Args:
             decoder_queries: Decoder output queries [batch_size, num_queries, hidden_size]
@@ -2158,7 +2158,7 @@ def get_text_features(
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Example:

diff --git a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
index 651a8551bb92..a67b3121cec8 100644
--- a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
+++ b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
@@ -1078,7 +1078,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVisionEncoderOutput:
+    ) -> Sam3TrackerVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
index 3bd55a0bb0ba..2a3f1bfac396 100644
--- a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
@@ -1864,7 +1864,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
index 1d357fe923b0..44e46de7f37c 100644
--- a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
@@ -550,7 +550,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py
index 83e558989b69..a8e98ae30ef6 100644
--- a/src/transformers/models/sam_hq/modeling_sam_hq.py
+++ b/src/transformers/models/sam_hq/modeling_sam_hq.py
@@ -556,7 +556,7 @@ def get_input_embeddings(self):

     @capture_outputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py
index e14627b71683..93e76495f1d3 100644
--- a/src/transformers/models/sam_hq/modular_sam_hq.py
+++ b/src/transformers/models/sam_hq/modular_sam_hq.py
@@ -192,7 +192,7 @@ class SamHQVisionEncoder(SamVisionEncoder, SamHQPreTrainedModel):

     @capture_outputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index a2b8778447ba..9196bd22d327 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -762,7 +762,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -792,7 +792,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py
index 9c6e5569b29c..d7bca0993509 100644
--- a/src/transformers/models/siglip2/modeling_siglip2.py
+++ b/src/transformers/models/siglip2/modeling_siglip2.py
@@ -840,7 +840,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -871,7 +871,7 @@ def get_image_features(
         pixel_attention_mask: torch.Tensor | None = None,
         spatial_shapes: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
             Mask to avoid performing attention on padding pixel indices.
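For callers, the payoff of the narrowed annotations is static: with the old `tuple | Output` union, a type checker flags any direct attribute access on the result until the union is narrowed. A generic sketch of the before/after — the `VisionEncoderOutput` dataclass and both `*_forward` helpers here are invented stand-ins, not transformers code:

```python
# Generic illustration of what dropping `tuple |` from the annotation buys
# under mypy/pyright; requires torch only for a realistic tensor field.
from dataclasses import dataclass

import torch


@dataclass
class VisionEncoderOutput:  # stand-in for e.g. SamVisionEncoderOutput
    last_hidden_state: torch.Tensor


def old_forward() -> "tuple | VisionEncoderOutput":
    return VisionEncoderOutput(torch.zeros(1, 4))


def new_forward() -> VisionEncoderOutput:
    return VisionEncoderOutput(torch.zeros(1, 4))


out = old_forward()
if isinstance(out, VisionEncoderOutput):  # narrowing the union was mandatory
    pooled = out.last_hidden_state.mean(dim=-1)

pooled = new_forward().last_hidden_state.mean(dim=-1)  # no narrowing needed
```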
diff --git a/src/transformers/models/siglip2/modular_siglip2.py b/src/transformers/models/siglip2/modular_siglip2.py
index a12f76e67910..cf437000677d 100644
--- a/src/transformers/models/siglip2/modular_siglip2.py
+++ b/src/transformers/models/siglip2/modular_siglip2.py
@@ -438,7 +438,7 @@ def get_image_features(
         pixel_attention_mask: torch.Tensor | None = None,
         spatial_shapes: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
             Mask to avoid performing attention on padding pixel indices.
diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py
index 3c80c3aa7e41..780331751404 100644
--- a/src/transformers/models/smolvlm/modeling_smolvlm.py
+++ b/src/transformers/models/smolvlm/modeling_smolvlm.py
@@ -350,7 +350,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_size = self.patch_size
@@ -535,7 +535,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         pixel_attention_mask: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -610,7 +610,7 @@ def forward(
         return_dict: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | SmolVLMBaseModelOutputWithPast:
+    ) -> SmolVLMBaseModelOutputWithPast:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
             Mask to avoid performing attention on padding pixel indices.
@@ -780,7 +780,7 @@ def forward(
         return_dict: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SmolVLMCausalLMOutputWithPast:
+    ) -> SmolVLMCausalLMOutputWithPast:
         r"""
         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
             Mask to avoid performing attention on padding pixel indices.
diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py
index 179b809c6b9d..d427c585649d 100644
--- a/src/transformers/models/smolvlm/modular_smolvlm.py
+++ b/src/transformers/models/smolvlm/modular_smolvlm.py
@@ -201,7 +201,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         pixel_attention_mask: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -276,7 +276,7 @@ def forward(
         return_dict: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | SmolVLMBaseModelOutputWithPast:
+    ) -> SmolVLMBaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index fdbd136acc68..b8908e9035c7 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -297,7 +297,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None

@@ -375,7 +375,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index bd97e1c9ee91..254895bc119e 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -363,7 +363,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

diff --git a/src/transformers/models/starcoder2/modular_starcoder2.py b/src/transformers/models/starcoder2/modular_starcoder2.py
index 6d0f2b616691..4dd3c1abdcbc 100644
--- a/src/transformers/models/starcoder2/modular_starcoder2.py
+++ b/src/transformers/models/starcoder2/modular_starcoder2.py
@@ -154,7 +154,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
index d0949ff6d389..bfd43cebb882 100644
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -680,7 +680,7 @@ def forward(
         use_cache=None,
         cache_position=None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoEModelOutputWithPastAndCrossAttentions:
+    ) -> MoEModelOutputWithPastAndCrossAttentions:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -814,7 +814,7 @@ def forward(
         decoder_inputs_embeds: torch.Tensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput:
+    ) -> Seq2SeqMoEModelOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs
@@ -969,7 +969,7 @@ def forward(
         output_router_logits: bool | None = False,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput:
+    ) -> Seq2SeqMoEOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids,
@@ -1107,7 +1107,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | MoEModelOutput:
+    ) -> MoEModelOutput:
         use_cache = False
         encoder_outputs = self.encoder(
             input_ids=input_ids,
diff --git a/src/transformers/models/switch_transformers/modular_switch_transformers.py b/src/transformers/models/switch_transformers/modular_switch_transformers.py
index dcfe061da793..8bbe555cbed5 100644
--- a/src/transformers/models/switch_transformers/modular_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modular_switch_transformers.py
@@ -444,7 +444,7 @@ def forward(
         use_cache=None,
         cache_position=None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoEModelOutputWithPastAndCrossAttentions:
+    ) -> MoEModelOutputWithPastAndCrossAttentions:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -578,7 +578,7 @@ def forward(
         decoder_inputs_embeds: torch.Tensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput:
+    ) -> Seq2SeqMoEModelOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs
@@ -668,7 +668,7 @@ def forward(
         output_router_logits: bool | None = False,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput:
+    ) -> Seq2SeqMoEOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids,
@@ -806,7 +806,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | MoEModelOutput:
+    ) -> MoEModelOutput:
         use_cache = False
         encoder_outputs = self.encoder(
             input_ids=input_ids,
diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py
index 2091cf421dfd..5051fa9c0738 100644
--- a/src/transformers/models/t5gemma/modeling_t5gemma.py
+++ b/src/transformers/models/t5gemma/modeling_t5gemma.py
@@ -692,7 +692,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -792,7 +792,7 @@ def forward(
         encoder_hidden_states: torch.Tensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
         if encoder_hidden_states is None:
@@ -1033,7 +1033,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
             Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index 7cff24b84793..56fe1acd6015 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -685,7 +685,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

@@ -785,7 +785,7 @@ def forward(
         encoder_hidden_states: torch.Tensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
         if encoder_hidden_states is None:
@@ -1026,7 +1026,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
             Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
index a022d5a9436f..3c255decd994 100644
--- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
@@ -891,7 +891,7 @@ def set_input_embeddings(self, new_embeddings):

     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         # pixel_values: (batch_size, channels, height, width)
         # image_features: Image feature tensor of shape (num_images, image_length, embed_dim).
         vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
@@ -1256,7 +1256,7 @@ def get_decoder(self):

     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
@@ -1286,7 +1286,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
             Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py
index 3ccdb9d672d3..68b84b524288 100644
--- a/src/transformers/models/t5gemma2/modular_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py
@@ -922,7 +922,7 @@ def set_input_embeddings(self, new_embeddings):

     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         # pixel_values: (batch_size, channels, height, width)
         # image_features: Image feature tensor of shape (num_images, image_length, embed_dim).
         vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs)
@@ -1274,7 +1274,7 @@ def get_decoder(self):

     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
@@ -1304,7 +1304,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
             Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
index c7b5490ec050..188bf35701ec 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
@@ -501,7 +501,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -527,7 +529,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -541,7 +543,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
index 85f97e75dbc5..f8f590c92b8b 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
@@ -423,7 +423,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -449,7 +451,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -463,7 +465,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/video_llama_3/modeling_video_llama_3.py b/src/transformers/models/video_llama_3/modeling_video_llama_3.py
index 5f626efb7d39..eabcdf632101 100644
--- a/src/transformers/models/video_llama_3/modeling_video_llama_3.py
+++ b/src/transformers/models/video_llama_3/modeling_video_llama_3.py
@@ -351,7 +351,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`):
             The cumulative sequence lengths of each image or video feature.
@@ -445,7 +445,7 @@ def forward(
         grid_thw: torch.Tensor,
         merge_sizes: torch.Tensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
@@ -555,7 +555,7 @@ def get_video_features(
         video_grid_thw: torch.LongTensor,
         video_merge_sizes: torch.LongTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -579,7 +579,7 @@ def get_image_features(
         image_grid_thw: torch.LongTensor,
         image_merge_sizes: torch.LongTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -663,7 +663,7 @@ def forward(
         video_compression_mask: torch.BoolTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | VideoLlama3ModelOutputWithPast:
+    ) -> VideoLlama3ModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -830,7 +830,7 @@ def forward(
         video_compression_mask: torch.BoolTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | VideoLlama3CausalLMOutputWithPast:
+    ) -> VideoLlama3CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py
index 9ea9b1fcb370..c553e600d022 100644
--- a/src/transformers/models/video_llama_3/modular_video_llama_3.py
+++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py
@@ -417,7 +417,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`):
             The cumulative sequence lengths of each image or video feature.
@@ -501,7 +501,7 @@ def forward(
         grid_thw: torch.Tensor,
         merge_sizes: torch.Tensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
@@ -610,7 +610,7 @@ def get_video_features(
         video_grid_thw: torch.LongTensor,
         video_merge_sizes: torch.LongTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -634,7 +634,7 @@ def get_image_features(
         image_grid_thw: torch.LongTensor,
         image_merge_sizes: torch.LongTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -677,7 +677,7 @@ def forward(
         video_compression_mask: torch.BoolTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | VideoLlama3ModelOutputWithPast:
+    ) -> VideoLlama3ModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -801,7 +801,7 @@ def forward(
         video_compression_mask: torch.BoolTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | VideoLlama3CausalLMOutputWithPast:
+    ) -> VideoLlama3CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index 71254e964a91..247d70d12bc3 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -182,7 +182,7 @@ def get_image_features(
         vision_feature_select_strategy: str | None = None,
         output_hidden_states: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
             The tensors corresponding to the input images.
@@ -230,7 +230,7 @@ def get_video_features(
         vision_feature_layer: int | list[int] | None = None,
         output_hidden_states: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`)
             The tensors corresponding to the input videos.
@@ -322,7 +322,7 @@ def forward(
         return_dict: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | VideoLlavaModelOutputWithPast:
+    ) -> VideoLlavaModelOutputWithPast:
         r"""
         pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images. Pixel values can be obtained using
@@ -464,7 +464,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | VideoLlavaCausalLMOutputWithPast:
+    ) -> VideoLlavaCausalLMOutputWithPast:
         r"""
         pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images. Pixel values can be obtained using
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
index 1c07c5302995..5a03f7348658 100644
--- a/src/transformers/models/vipllava/modeling_vipllava.py
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -160,7 +160,7 @@ def get_image_features(
         vision_feature_layers: int | list[int] | None = None,
         output_hidden_states: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
             The tensors corresponding to the input images.
@@ -351,7 +351,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | VipLlavaCausalLMOutputWithPast:
+    ) -> VipLlavaCausalLMOutputWithPast:
         r"""
         vision_feature_layers (`Union[int, list[int]]`, *optional*):
             The vision feature layer, or the list of indexes of the layers to select
diff --git a/src/transformers/models/vipllava/modular_vipllava.py b/src/transformers/models/vipllava/modular_vipllava.py
index dd836e95bff3..2c613819b537 100644
--- a/src/transformers/models/vipllava/modular_vipllava.py
+++ b/src/transformers/models/vipllava/modular_vipllava.py
@@ -82,7 +82,7 @@ def get_image_features(
         vision_feature_layers: int | list[int] | None = None,
         output_hidden_states: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
             The tensors corresponding to the input images.
@@ -219,7 +219,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | VipLlavaCausalLMOutputWithPast:
+    ) -> VipLlavaCausalLMOutputWithPast:
         r"""
         vision_feature_layers (`Union[int, list[int]]`, *optional*):
             The vision feature layer, or the list of indexes of the layers to select
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 12fcb924b3d3..d46264d1773d 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -111,7 +111,7 @@ def get_text_features(
         position_ids: torch.Tensor | None = None,
         token_type_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -143,7 +143,7 @@ def get_text_features(

     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
index dfca3597f2c7..be3c5175e2c3 100644
--- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
+++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py
@@ -397,7 +397,7 @@ def forward(
         dataset_index: torch.Tensor | None = None,
         output_hidden_states: bool | None = None,
         **kwargs,
-    ):
+    ) -> BackboneOutput:
         r"""
         dataset_index (`torch.Tensor` of shape `(batch_size,)`):
             Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.
diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py
index 2b638042b118..8afd689d4c4d 100644
--- a/src/transformers/models/vjepa2/modeling_vjepa2.py
+++ b/src/transformers/models/vjepa2/modeling_vjepa2.py
@@ -1087,7 +1087,7 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         **kwargs,
-    ) -> tuple | ImageClassifierOutput:
+    ) -> ImageClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py
index e5759b18fea6..8ccc2d3aa113 100644
--- a/src/transformers/models/voxtral/modeling_voxtral.py
+++ b/src/transformers/models/voxtral/modeling_voxtral.py
@@ -298,7 +298,7 @@ def forward(
         input_features,
         attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Args:
             input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -405,7 +405,7 @@ def get_decoder(self):
     )
     def get_audio_features(
         self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py
index b87f0e238e56..394aba27c05f 100644
--- a/src/transformers/models/voxtral/modular_voxtral.py
+++ b/src/transformers/models/voxtral/modular_voxtral.py
@@ -73,7 +73,7 @@ def forward(
         input_features,
         attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Args:
             input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
@@ -171,7 +171,7 @@ def get_decoder(self):
     )
     def get_audio_features(
         self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
index 6212b61bd2a7..e5b9d2228df0 100644
--- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
@@ -544,7 +544,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -1000,7 +1000,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
index 93fcfc48e212..4d85a1d672d3 100644
--- a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
@@ -315,7 +315,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -551,7 +551,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py
index e85b8426aee3..110bd10c5d63 100644
--- a/src/transformers/models/x_clip/modeling_x_clip.py
+++ b/src/transformers/models/x_clip/modeling_x_clip.py
@@ -565,7 +565,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -1181,7 +1181,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -1214,7 +1214,7 @@ def get_video_features(
         self,
         pixel_values: torch.Tensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
index 67faea68874d..ccb6c237f601 100644
--- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
@@ -618,7 +618,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -761,7 +761,7 @@ def forward(
         use_cache: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -870,7 +870,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -966,7 +966,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1050,7 +1050,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1151,7 +1151,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1219,7 +1219,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py
index f9404c0d7993..024e0f376a79 100644
--- a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py
@@ -83,7 +83,7 @@ def forward(
         use_cache: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -177,7 +177,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -246,7 +246,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -326,7 +326,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -419,7 +419,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -483,7 +483,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
index 92614bd6ecfe..c676a3ac1026 100644
--- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
@@ -614,7 +614,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -803,7 +803,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -902,7 +902,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -964,7 +964,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -1036,7 +1036,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1127,7 +1127,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -1192,7 +1192,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         outputs = self.roberta(
             input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py
index df51342415e9..208db351d138 100644
--- a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py
@@ -303,7 +303,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -402,7 +402,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -464,7 +464,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss.
Indices should be in `[0, ..., @@ -536,7 +536,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -627,7 +627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -692,7 +692,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.roberta( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xlstm/modeling_xlstm.py b/src/transformers/models/xlstm/modeling_xlstm.py index 52827267312e..8f025ab56eae 100644 --- a/src/transformers/models/xlstm/modeling_xlstm.py +++ b/src/transformers/models/xlstm/modeling_xlstm.py @@ -1414,7 +1414,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMOutput: + ) -> xLSTMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. @@ -1550,7 +1550,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMCausalLMOutput: + ) -> xLSTMCausalLMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 6e8c3d9d918d..20fbbe2b2382 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -720,7 +720,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -886,7 +886,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. 
Default: the index @@ -996,7 +996,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1088,7 +1088,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1167,7 +1167,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1272,7 +1272,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1358,7 +1358,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 62593da73f1d..4f5b1c7f05f7 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -27,7 +27,7 @@ from dataclasses import fields, is_dataclass from enum import Enum from functools import partial, wraps -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING, Any, ParamSpec, TypedDict, TypeVar import numpy as np @@ -35,6 +35,13 @@ from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy +# Used to type hint decorators that modify the signature of the decorated function +P = ParamSpec("P") +T = TypeVar("T") + + +_CAN_RECORD_REGISTRY = {} + _is_torch_available = False if is_torch_available(): # required for @can_return_tuple decorator to work with torchdynamo @@ -825,22 +832,28 @@ def del_attribute_from_modules(module: nn.Module, key: str): del_attribute_from_modules(submodule, key) -def can_return_tuple(func): +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. 
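+# For example, a `forward` annotated as `-> BaseModelOutput` is then seen by type checkers as returning +# `tuple | BaseModelOutput`, matching the runtime behavior when `return_dict=False` is passed.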
+def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]: """ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or use_return_dict=False is set in the config. + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Note: output.to_tuple() convert output to tuple skipping all `None` values. """ @wraps(func) - def wrapper(self, *args, **kwargs): + def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T: + self = args[0] return_dict = self.config.return_dict if hasattr(self, "config") else True return_dict_passed = kwargs.pop("return_dict", return_dict) if return_dict_passed is not None: return_dict = return_dict_passed - output = func(self, *args, **kwargs) + output = func(*args, **kwargs) if not return_dict and not isinstance(output, tuple): output = output.to_tuple() return output diff --git a/src/transformers/utils/output_capturing.py b/src/transformers/utils/output_capturing.py index 0aa4c0290e05..05eb6b40b5c8 100644 --- a/src/transformers/utils/output_capturing.py +++ b/src/transformers/utils/output_capturing.py @@ -19,10 +19,11 @@ from __future__ import annotations import threading +from collections.abc import Callable from contextvars import ContextVar from dataclasses import dataclass from functools import wraps -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, ParamSpec, TypeVar from .import_utils import is_torchdynamo_compiling, requires @@ -33,6 +34,12 @@ from ..modeling_utils import PreTrainedModel +# Used to type hint decorators that modify the signature of the decorated function + +P = ParamSpec("P") +T = TypeVar("T") + + _CAN_RECORD_REGISTRY = {} @@ -203,13 +210,18 @@ def maybe_install_capturing_hooks(model: PreTrainedModel) -> None: install_all_output_capturing_hooks(model) -def capture_outputs(func=None, *, tie_last_hidden_states=True): +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. +def capture_outputs(func: Callable[P, T] | None = None, *, tie_last_hidden_states=True) -> Callable[P, tuple | T]: """ Decorator to intercept specific layer outputs through hooks. The hooks are installed only once and lazily, the first time output capture is requested with the `output_xxx` kwargs/config. The implementation is fully context/thread safe, except when using `torch.compile`, as dynamo is unable to trace through `ContextVar` methods. + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Args: tie_last_hidden_states (`bool`, *optional*, defaults to `True`): Whether to overwrite `out.hidden_states[-1]` with the `out.last_hidden_state`. @@ -218,9 +230,10 @@ def capture_outputs(func=None, *, tie_last_hidden_states=True): is needed for some vision models (e.g. 
CLIP, SigLIP) """ - def wrapped_fn(func): + def wrapped_fn(func: Callable[P, T]) -> Callable[P, tuple | T]: @wraps(func) - def wrapper(self, *args, **kwargs): + def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T: + self, *args = args # Pop it so that internal modules always return a dict even if False is requested return_dict = kwargs.pop("return_dict", getattr(self.config, "return_dict", True)) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py new file mode 100644 index 000000000000..fe652b61142e --- /dev/null +++ b/utils/check_decorator_return_types.py @@ -0,0 +1,341 @@ +# Copyright 2026 The HuggingFace Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AST-based checks for decorators that modify return types. + +This script ensures that functions decorated with `can_return_tuple` or +`capture_outputs`: + +1. Have an explicit, non-`None` return annotation. +2. Are not annotated with a union that already includes `tuple`. + +The intention is that the decorators themselves are responsible for +adding the `tuple` part of the return type, so the underlying function +should be annotated with just the base return type. + +Usage (from the root of the repo): + +```bash +python utils/check_decorator_return_types.py +``` +""" + +from __future__ import annotations + +import argparse +import ast +import os +from collections.abc import Iterable +from dataclasses import dataclass + + +PATH_TO_TRANSFORMERS = "src/transformers/models" + + +TARGET_DECORATORS = {"can_return_tuple", "capture_outputs"} + + +@dataclass +class Violation: + file_path: str + line: int + function_name: str + decorator_name: str + message: str + + def format(self) -> str: + return ( + f"{self.file_path}:{self.line}: function '{self.function_name}' " + f"decorated with '@{self.decorator_name}' {self.message}" + ) + + +def _iter_python_files(root: str) -> Iterable[str]: + for dirpath, _, filenames in os.walk(root): + for filename in filenames: + if filename.endswith(".py"): + yield os.path.join(dirpath, filename) + + +def _decorator_name(node: ast.expr) -> str | None: + """Return the simple name of a decorator + + Handles forms like: + - @can_return_tuple + - @utils.can_return_tuple + - @can_return_tuple(...) + """ + + target = node.func if isinstance(node, ast.Call) else node + + if isinstance(target, ast.Name): + return target.id + elif isinstance(target, ast.Attribute): + return target.attr + return None + + +def _is_none_annotation(returns: ast.expr | None) -> bool: + return ( + returns is None + or isinstance(returns, ast.Constant) + and returns.value is None + or isinstance(returns, ast.Name) + and returns.id == "None" + ) + + +def _is_tuple_type(node: ast.AST) -> bool: + """Return True if the node represents a tuple type. 
+ + We conservatively treat the following as tuple types: + - `tuple` + - `tuple[...]` + - `Tuple[...]` (from typing) + """ + + if isinstance(node, ast.Name) and node.id in {"tuple", "Tuple"}: + return True + + if isinstance(node, ast.Subscript): + value = node.value + if isinstance(value, ast.Name) and value.id in {"tuple", "Tuple"}: + return True + + return False + + +def _iter_union_members(node: ast.AST) -> Iterable[ast.AST]: + """Yield flattened members of a PEP 604-style union (X | Y | Z). + + For non-union nodes, yields the node itself once. + """ + + if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr): + yield from _iter_union_members(node.left) + yield from _iter_union_members(node.right) + else: + yield node + + +def _has_tuple_in_union(returns: ast.expr) -> bool: + members = list(_iter_union_members(returns)) + if len(members) <= 1: + # Not a union + return False + + return any(_is_tuple_type(member) for member in members) + + +def _is_delegating_to_super(func_node: ast.AST) -> bool: + """Return True if the function body starts with a super(...) delegation. + + We ignore functions whose first non-docstring statement is either: + - `return super(...` (possibly via an attribute like `super().foo(...)`), or + - `super(...` as a bare expression. + """ + + if not isinstance(func_node, (ast.FunctionDef, ast.AsyncFunctionDef)): + return False + + body = getattr(func_node, "body", []) + if not body: + return False + + # Skip an initial docstring expression if present. + first_stmt_idx = 0 + if ( + isinstance(body[0], ast.Expr) + and isinstance(body[0].value, ast.Constant) + and isinstance(body[0].value.value, str) + ): + first_stmt_idx = 1 + + if first_stmt_idx >= len(body): + return False + + first_stmt = body[first_stmt_idx] + if isinstance(first_stmt, ast.Return): + target = first_stmt.value + elif isinstance(first_stmt, ast.Expr): + target = first_stmt.value + else: + return False + + if target is None: + return False + + # Look for a super(...) call anywhere in the expression tree. + for node in ast.walk(target): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "super": + return True + + return False + + +def _collect_decorated_functions(tree: ast.AST) -> list[tuple[ast.AST, str]]: + """Return (function_node, decorator_name) pairs for targeted decorators.""" + + functions: list[tuple[ast.AST, str]] = [] + for node in ast.walk(tree): + if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + continue + if not node.decorator_list: + continue + for deco in node.decorator_list: + name = _decorator_name(deco) + if name in TARGET_DECORATORS: + functions.append((node, name)) + break + return functions + + +def _compute_line_offsets(source: str) -> list[int]: + """Return starting offset in the full string for each line (0-based).""" + + offsets = [0] + total = 0 + for line in source.splitlines(keepends=True): + total += len(line) + offsets.append(total) + return offsets + + +def _make_union_without_tuple(returns: ast.expr) -> str | None: + """Build a new union annotation string without any tuple-type members. + + Returns the new annotation expression as a string, or None if it cannot + be constructed (e.g. all members were tuple types). + """ + + members = [m for m in _iter_union_members(returns) if not _is_tuple_type(m)] + if not members: + return None + + # We rely on Python's built-in unparser (3.9+). 
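+ # e.g. for `tuple | BaseModelOutput`, the one surviving member unparses back to the + # string "BaseModelOutput", which is then rejoined below with " | ".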
+ pieces = [ast.unparse(m) for m in members] + return " | ".join(pieces) + + +def check_decorator_return_types(overwrite: bool = False): + all_violations: list[Violation] = [] + unfixable_violations: list[Violation] = [] + + for file_path in _iter_python_files(PATH_TO_TRANSFORMERS): + with open(file_path, "r", encoding="utf-8") as f: + source = f.read() + + try: + tree = ast.parse(source, filename=file_path, type_comments=True) + except SyntaxError as e: + print(f"Skipping {file_path} due to SyntaxError: {e}") + continue + + functions = _collect_decorated_functions(tree) + if not functions: + continue + + fixes: list[tuple[int, int, str]] = [] # (start, end, new_text) + + for func_node, decorator_name in functions: + # Ignore trivial delegations like `return super(...` or `super(...`. + # We skip these as this happens sometimes in modular files with methods that inherit their return + # type from another architecture. Then they'll have no explicit return type, but we'll test via + # the generated modeling file instead. + if _is_delegating_to_super(func_node): + continue + + returns = func_node.returns + + # 1. Must have a non-None return annotation. + if _is_none_annotation(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must have a non-None return annotation", + ) + all_violations.append(v) + unfixable_violations.append(v) + continue + + # Nothing else to do without an annotation. + if returns is None: + continue + + # 2. Annotation must not already be a union including `tuple`. + if _has_tuple_in_union(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must not be annotated with a union that includes 'tuple'", + ) + all_violations.append(v) + + if not overwrite: + continue + + new_annotation = _make_union_without_tuple(returns) + if new_annotation is None: + unfixable_violations.append(v) + continue + + # Use precise offsets to replace just the annotation. + if not hasattr(returns, "lineno") or not hasattr(returns, "end_lineno"): + unfixable_violations.append(v) + continue + + line_offsets = _compute_line_offsets(source) + try: + start = line_offsets[returns.lineno - 1] + returns.col_offset + end = line_offsets[returns.end_lineno - 1] + returns.end_col_offset + except IndexError: + unfixable_violations.append(v) + continue + + fixes.append((start, end, new_annotation)) + + if overwrite and fixes: + # Apply fixes from the end of the file backwards so offsets stay valid. + fixes.sort(key=lambda x: x[0], reverse=True) + new_source = source + for start, end, text in fixes: + new_source = new_source[:start] + text + new_source[end:] + + if new_source != source: + print(f"Updating return annotations in {file_path} to drop 'tuple' from unions.") + with open(file_path, "w", encoding="utf-8", newline="\n") as f: + f.write(new_source) + + if all_violations and not overwrite: + header = "Found decorator return-type violations:\n\n" + body = "\n".join(v.format() for v in all_violations) + footer = "\n\nRun this script with --fix_and_overwrite to auto-fix some violations." + raise ValueError(header + body + footer) + + if overwrite and unfixable_violations: + header = "Found decorator return-type violations that could not be auto-fixed:\n\n" + body = "\n".join(v.format() for v in unfixable_violations) + footer = "\n\nPlease fix these annotations manually." 
+ raise ValueError(header + body + footer) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.") + args = parser.parse_args() + + check_decorator_return_types(args.fix_and_overwrite)
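For reference, here is a minimal, self-contained sketch of the ParamSpec pattern that `can_return_tuple` and `capture_outputs` rely on above. The names (`returns_tuple_or`, `Model`, `Output`) are illustrative stand-ins, not transformers API; the point is that the decorator widens the return type from `T` to `tuple | T`, so the decorated method can carry the bare base annotation only.

```python
# Toy analogue of `can_return_tuple` (Python 3.10+ for ParamSpec and `X | Y`).
from collections.abc import Callable
from functools import wraps
from typing import ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")


def returns_tuple_or(func: Callable[P, T]) -> Callable[P, tuple | T]:
    @wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
        # Keeping `self` inside *args preserves the ParamSpec signature
        # (no Concatenate needed) -- hence `self = args[0]` in the real code.
        self = args[0]
        output = func(*args, **kwargs)
        if not getattr(self, "return_dict", True) and not isinstance(output, tuple):
            output = tuple(output)  # stand-in for `output.to_tuple()`
        return output

    return wrapper


class Output(list):
    """Stand-in for a ModelOutput dataclass."""


class Model:
    return_dict = False

    @returns_tuple_or
    def forward(self, x: int) -> Output:  # annotated with the base type only
        return Output([x, x + 1])


# A type checker infers `tuple | Output` for this call, while the annotation
# on `forward` itself stays clean; at runtime return_dict=False yields a tuple.
print(Model().forward(1))  # -> (1, 2)
```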
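And a compressed, standalone demo of the AST walk that `utils/check_decorator_return_types.py` performs: flatten a PEP 604 union in a return annotation, flag any `tuple`/`Tuple` member, and compute the annotation the fixer would write. The `SAMPLE` snippet is made up for illustration.

```python
import ast

SAMPLE = '''
@can_return_tuple
def forward(self) -> tuple | BaseModelOutput:
    ...
'''


def union_members(node: ast.AST):
    # Flatten `X | Y | Z` into its leaves; non-unions yield themselves once.
    if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr):
        yield from union_members(node.left)
        yield from union_members(node.right)
    else:
        yield node


def is_tuple_type(node: ast.AST) -> bool:
    # Matches `tuple`, `tuple[...]`, `Tuple`, `Tuple[...]`.
    target = node.value if isinstance(node, ast.Subscript) else node
    return isinstance(target, ast.Name) and target.id in {"tuple", "Tuple"}


tree = ast.parse(SAMPLE)
func = next(n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef))
members = list(union_members(func.returns))
offenders = [ast.unparse(m) for m in members if is_tuple_type(m)]
kept = " | ".join(ast.unparse(m) for m in members if not is_tuple_type(m))
print(offenders)  # ['tuple']           -> a violation is reported
print(kept)       # 'BaseModelOutput'   -> what --fix_and_overwrite writes back
```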
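Finally, a toy demonstration (with made-up offsets, not transformers code) of why the auto-fixer sorts its `(start, end, new_text)` fixes in reverse before applying them: replacing a span shifts every offset after it, so later spans must be rewritten first for the earlier, smaller offsets to stay valid.

```python
# Two annotations to fix in one "file"; offsets index into the original string.
source = "a -> tuple | X; b -> tuple | Y"
fixes = [(5, 14, "X"), (21, 30, "Y")]  # (start, end, replacement)

# Apply back-to-front so earlier offsets remain correct after each splice.
for start, end, text in sorted(fixes, key=lambda f: f[0], reverse=True):
    source = source[:start] + text + source[end:]

print(source)  # a -> X; b -> Y
```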