From bffa96344e81a7338be53c716476d80538dc8786 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 23 Jan 2026 13:36:44 +0100
Subject: [PATCH 01/20] Automatically update can_return_tuple/check_model_inputs
 wrapped return type

This updates the typing of these two functions, so that a wrapped function
with return type X is automatically typed as `tuple | X`.
---
 src/transformers/utils/generic.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index e530b5f9acc8..b459091afb68 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -25,14 +25,19 @@
 from dataclasses import dataclass, fields, is_dataclass
 from enum import Enum
 from functools import partial, wraps
-from typing import Any, Optional, TypedDict
+from typing import Any, Optional, TypedDict, TypeVar
 
 import numpy as np
+from typing_extensions import ParamSpec
 
 from ..utils import logging
 from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires
 
 
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
 _CAN_RECORD_REGISTRY = {}
 
 
@@ -817,7 +822,7 @@ def del_attribute_from_modules(module: "torch.nn.Module", key: str):
         del_attribute_from_modules(submodule, key)
 
 
-def can_return_tuple(func):
+def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]:
     """
     Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or
     use_return_dict=False is set in the config.
@@ -827,12 +832,13 @@ def can_return_tuple(func):
     """
 
     @wraps(func)
-    def wrapper(self, *args, **kwargs):
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
+        self = args[0]
         return_dict = self.config.return_dict if hasattr(self, "config") else True
         return_dict_passed = kwargs.pop("return_dict", return_dict)
         if return_dict_passed is not None:
             return_dict = return_dict_passed
-        output = func(self, *args, **kwargs)
+        output = func(*args, **kwargs)
         if not return_dict and not isinstance(output, tuple):
             output = output.to_tuple()
         return output
@@ -859,7 +865,9 @@ class OutputRecorder:
     class_name: str | None = None
 
 
-def check_model_inputs(func=None, *, tie_last_hidden_states=True):
+def check_model_inputs(
+    func: Callable[P, T] | None = None, *, tie_last_hidden_states: bool = True
+) -> Callable[P, tuple | T]:
     """
     Decorator to intercept specific layer outputs without using hooks. Compatible with
     torch.compile (Dynamo tracing).
@@ -872,9 +880,10 @@ def check_model_inputs(func=None, *, tie_last_hidden_states=True):
         is needed for some vision models (e.g. CLIP, SigLIP)
     """
 
-    def wrapped_fn(func):
+    def wrapped_fn(func: Callable[P, T]) -> Callable[P, tuple | T]:
         @wraps(func)
-        def wrapper(self, *args, **kwargs):
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
+            self, *args = args
             args_with_config_defaults = [
                 "use_cache",
                 "vision_feature_layer",

From 676206964999bea2a85cb792c13a2bbffb5bcfc7 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 23 Jan 2026 13:37:32 +0100
Subject: [PATCH 02/20] Add check_decorator_return_types.py script to check
 against 'tuple | X'

It verifies that users no longer use return annotations such as
`tuple | BaseModelOutputWithPooling`; they should use
`BaseModelOutputWithPooling` instead.
It also makes sure that a return annotation is present at all.
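To illustrate the convention both patches rely on, here is a minimal,
self-contained sketch. `DemoOutput` and `DemoModel` are made-up names for this
example; only `can_return_tuple` comes from the patch above:

```python
from types import SimpleNamespace

from transformers.utils.generic import can_return_tuple


class DemoOutput:
    """Toy stand-in for a `ModelOutput` subclass."""

    def __init__(self, logits):
        self.logits = logits

    def to_tuple(self):
        return (self.logits,)


class DemoModel:
    # `can_return_tuple` reads `self.config.return_dict` when present.
    config = SimpleNamespace(return_dict=True)

    @can_return_tuple
    def forward(self, x: int) -> DemoOutput:  # annotate only the base type
        return DemoOutput(logits=x)


model = DemoModel()
# Statically, `forward` is now seen as returning `tuple | DemoOutput`;
# at runtime, `return_dict=False` triggers the `to_tuple()` conversion.
print(type(model.forward(1)).__name__)                     # DemoOutput
print(type(model.forward(1, return_dict=False)).__name__)  # tuple
```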
---
 .circleci/config.yml                          |   1 +
 .github/workflows/pr-repo-consistency-bot.yml |   2 +
 utils/check_decorator_return_types.py         | 351 ++++++++++++++++++
 3 files changed, 354 insertions(+)
 create mode 100644 utils/check_decorator_return_types.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7875cdc368f5..c21aab20bd8a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -189,6 +189,7 @@ jobs:
       - run: python utils/check_config_docstrings.py
       - run: python utils/check_config_attributes.py
       - run: python utils/check_doctest_list.py
+      - run: python utils/check_decorator_return_types.py
       - run: python utils/update_metadata.py --check-only
       - run: python utils/add_dates.py --check-only
       - run: >
diff --git a/.github/workflows/pr-repo-consistency-bot.yml b/.github/workflows/pr-repo-consistency-bot.yml
index 29046f8281c9..7ed47499377e 100644
--- a/.github/workflows/pr-repo-consistency-bot.yml
+++ b/.github/workflows/pr-repo-consistency-bot.yml
@@ -165,6 +165,7 @@ jobs:
           cp utils/check_pipeline_typing.py pr-repo/utils/check_pipeline_typing.py
           cp utils/check_doctest_list.py pr-repo/utils/check_doctest_list.py
           cp utils/check_docstrings.py pr-repo/utils/check_docstrings.py
+          cp utils/check_decorator_return_types.py pr-repo/utils/check_decorator_return_types.py
           cp utils/add_dates.py pr-repo/utils/add_dates.py
 
           # Run commands in PR directory (with the copied trusted scripts)
@@ -185,6 +186,7 @@ jobs:
           python utils/check_pipeline_typing.py --fix_and_overwrite
           python utils/check_doctest_list.py --fix_and_overwrite
           python utils/check_docstrings.py --fix_and_overwrite
+          python utils/check_decorator_return_types.py --fix_and_overwrite
           python utils/add_dates.py
 
           # Check if there are changes
diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py
new file mode 100644
index 000000000000..2c6cbb3d5ac0
--- /dev/null
+++ b/utils/check_decorator_return_types.py
@@ -0,0 +1,351 @@
+# Copyright 2026 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""AST-based checks for decorators that modify return types.
+
+This script ensures that functions decorated with `can_return_tuple` or
+`check_model_inputs`:
+
+1. Have an explicit, non-`None` return annotation.
+2. Are not annotated with a union that already includes `tuple`.
+
+The intention is that the decorators themselves are responsible for
+adding the `tuple` part of the return type, so the underlying function
+should be annotated with just the base return type.
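+
+For example, `-> tuple | BaseModelOutputWithPooling` is flagged, and the
+auto-fix rewrites it to `-> BaseModelOutputWithPooling`; the decorator then
+widens the signature back to `tuple | BaseModelOutputWithPooling` for callers.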
+ +Usage (from the root of the repo): + +```bash +python utils/check_decorator_return_types.py +``` +""" + +from __future__ import annotations + +import argparse +import ast +import os +from collections.abc import Iterable +from dataclasses import dataclass + + +PATH_TO_TRANSFORMERS = "src/transformers" + + +TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} + + +@dataclass +class Violation: + file_path: str + line: int + function_name: str + decorator_name: str + message: str + + def format(self) -> str: + return ( + f"{self.file_path}:{self.line}: function '{self.function_name}' " + f"decorated with '@{self.decorator_name}' {self.message}" + ) + + +def _iter_python_files(root: str) -> Iterable[str]: + for dirpath, _, filenames in os.walk(root): + for filename in filenames: + if filename.endswith(".py"): + yield os.path.join(dirpath, filename) + + +def _decorator_name(node: ast.expr) -> str | None: + """Return the simple name of a decorator, if it matches a target. + + Handles forms like: + - @can_return_tuple + - @utils.can_return_tuple + - @can_return_tuple(...) + - @utils.check_model_inputs(...) + """ + + target = node + if isinstance(target, ast.Call): + target = target.func + + if isinstance(target, ast.Name): + name = target.id + elif isinstance(target, ast.Attribute): + name = target.attr + else: + return None + + if name in TARGET_DECORATORS: + return name + return None + + +def _is_none_annotation(returns: ast.expr | None) -> bool: + if returns is None: + return True + + # -> None + if isinstance(returns, ast.Constant) and returns.value is None: + return True + + # -> None (as a name) + if isinstance(returns, ast.Name) and returns.id == "None": + return True + + return False + + +def _is_tuple_type(node: ast.AST) -> bool: + """Return True if the node represents a tuple type. + + We conservatively treat the following as tuple types: + - `tuple` + - `tuple[...]` + - `Tuple[...]` (from typing) + """ + + if isinstance(node, ast.Name) and node.id in {"tuple", "Tuple"}: + return True + + if isinstance(node, ast.Subscript): + value = node.value + if isinstance(value, ast.Name) and value.id in {"tuple", "Tuple"}: + return True + + return False + + +def _iter_union_members(node: ast.AST) -> Iterable[ast.AST]: + """Yield flattened members of a PEP 604-style union (X | Y | Z). + + For non-union nodes, yields the node itself once. + """ + + if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr): + yield from _iter_union_members(node.left) + yield from _iter_union_members(node.right) + else: + yield node + + +def _has_tuple_in_union(returns: ast.expr) -> bool: + members = list(_iter_union_members(returns)) + if len(members) <= 1: + # Not a union + return False + + return any(_is_tuple_type(member) for member in members) + + +def _is_delegating_to_super(func_node: ast.AST) -> bool: + """Return True if the function body starts with a super(...) delegation. + + We ignore functions whose first non-docstring statement is either: + - `return super(...` (possibly via an attribute like `super().foo(...)`), or + - `super(...` as a bare expression. + """ + + if not isinstance(func_node, (ast.FunctionDef, ast.AsyncFunctionDef)): + return False + + body = getattr(func_node, "body", []) + if not body: + return False + + # Skip an initial docstring expression if present. 
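+    # e.g. in `def forward(self): "docs"; return super().forward()`, body[0]
+    # is the docstring constant and body[1] is the delegating `return`.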
+ first_stmt_idx = 0 + if ( + isinstance(body[0], ast.Expr) + and isinstance(body[0].value, ast.Constant) + and isinstance(body[0].value.value, str) + ): + first_stmt_idx = 1 + + if first_stmt_idx >= len(body): + return False + + first_stmt = body[first_stmt_idx] + if isinstance(first_stmt, ast.Return): + target = first_stmt.value + elif isinstance(first_stmt, ast.Expr): + target = first_stmt.value + else: + return False + + if target is None: + return False + + # Look for a super(...) call anywhere in the expression tree. + for node in ast.walk(target): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "super": + return True + + return False + + +def _collect_decorated_functions(tree: ast.AST, file_path: str) -> list[tuple[ast.AST, str]]: + """Return (function_node, decorator_name) pairs for targeted decorators.""" + + functions: list[tuple[ast.AST, str]] = [] + for node in ast.walk(tree): + if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + continue + if not node.decorator_list: + continue + for deco in node.decorator_list: + name = _decorator_name(deco) + if name is not None: + functions.append((node, name)) + break + return functions + + +def _compute_line_offsets(source: str) -> list[int]: + """Return starting offset in the full string for each line (0-based).""" + + offsets = [0] + total = 0 + for line in source.splitlines(keepends=True): + total += len(line) + offsets.append(total) + return offsets + + +def _make_union_without_tuple(returns: ast.expr) -> str | None: + """Build a new union annotation string without any tuple-type members. + + Returns the new annotation expression as a string, or None if it cannot + be constructed (e.g. all members were tuple types). + """ + + members = [m for m in _iter_union_members(returns) if not _is_tuple_type(m)] + if not members: + return None + + # We rely on Python's built-in unparser (3.9+). + pieces = [ast.unparse(m) for m in members] + return " | ".join(pieces) + + +def check_decorator_return_types(overwrite: bool = False): + all_violations: list[Violation] = [] + unfixable_violations: list[Violation] = [] + + for file_path in _iter_python_files(PATH_TO_TRANSFORMERS): + with open(file_path, "r", encoding="utf-8") as f: + source = f.read() + + try: + tree = ast.parse(source, filename=file_path, type_comments=True) + except SyntaxError as e: + print(f"Skipping {file_path} due to SyntaxError: {e}") + continue + + functions = _collect_decorated_functions(tree, file_path) + if not functions: + continue + + fixes: list[tuple[int, int, str]] = [] # (start, end, new_text) + + for func_node, decorator_name in functions: + # Ignore trivial delegations like `return super(...` or `super(...`. + if _is_delegating_to_super(func_node): + continue + + returns = func_node.returns + + # 1. Must have a non-None return annotation. + if _is_none_annotation(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must have a non-None return annotation", + ) + all_violations.append(v) + unfixable_violations.append(v) + continue + + # Nothing else to do without an annotation. + if returns is None: + continue + + # 2. Annotation must not already be a union including `tuple`. 
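+            # e.g. `-> tuple | BaseModelOutputWithPooling` is a violation here;
+            # the auto-fix below rewrites it to `-> BaseModelOutputWithPooling`.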
+ if _has_tuple_in_union(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must not be annotated with a union that includes 'tuple'", + ) + all_violations.append(v) + + if not overwrite: + continue + + new_annotation = _make_union_without_tuple(returns) + if new_annotation is None: + unfixable_violations.append(v) + continue + + # Use precise offsets to replace just the annotation. + if not hasattr(returns, "lineno") or not hasattr(returns, "end_lineno"): + unfixable_violations.append(v) + continue + + line_offsets = _compute_line_offsets(source) + try: + start = line_offsets[returns.lineno - 1] + returns.col_offset + end = line_offsets[returns.end_lineno - 1] + returns.end_col_offset + except IndexError: + unfixable_violations.append(v) + continue + + fixes.append((start, end, new_annotation)) + + if overwrite and fixes: + # Apply fixes from the end of the file backwards so offsets stay valid. + fixes.sort(key=lambda x: x[0], reverse=True) + new_source = source + for start, end, text in fixes: + new_source = new_source[:start] + text + new_source[end:] + + if new_source != source: + print(f"Updating return annotations in {file_path} to drop 'tuple' from unions.") + with open(file_path, "w", encoding="utf-8", newline="\n") as f: + f.write(new_source) + + if all_violations and not overwrite: + header = "Found decorator return-type violations:\n\n" + body = "\n".join(v.format() for v in all_violations) + footer = "\n\nRun this script with --fix_and_overwrite to auto-fix some violations." + raise ValueError(header + body + footer) + + if overwrite and unfixable_violations: + header = "Found decorator return-type violations that could not be auto-fixed:\n\n" + body = "\n".join(v.format() for v in unfixable_violations) + footer = "\n\nPlease fix these annotations manually." 
+ raise ValueError(header + body + footer) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.") + args = parser.parse_args() + + check_decorator_return_types(args.fix_and_overwrite) From fa9caf7cdcf0fc3c0a421a24b04f081f06fd0dde Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:38:36 +0100 Subject: [PATCH 03/20] Run checking script & fix existing typings --- .../models/afmoe/modeling_afmoe.py | 2 +- .../models/afmoe/modular_afmoe.py | 2 +- .../models/aimv2/modeling_aimv2.py | 4 ++-- .../models/albert/modeling_albert.py | 14 ++++++------- .../models/align/modeling_align.py | 12 +++++------ .../models/altclip/modeling_altclip.py | 12 +++++------ src/transformers/models/aria/modeling_aria.py | 6 +++--- src/transformers/models/aria/modular_aria.py | 6 +++--- .../audioflamingo3/modeling_audioflamingo3.py | 4 ++-- .../audioflamingo3/modular_audioflamingo3.py | 4 ++-- .../models/aya_vision/modeling_aya_vision.py | 6 +++--- .../models/aya_vision/modular_aya_vision.py | 6 +++--- src/transformers/models/bert/modeling_bert.py | 18 ++++++++--------- .../modeling_bert_generation.py | 4 ++-- src/transformers/models/blip/modeling_blip.py | 14 ++++++------- .../models/blip_2/modeling_blip_2.py | 20 +++++++++---------- .../bridgetower/modeling_bridgetower.py | 2 +- src/transformers/models/bros/modeling_bros.py | 10 +++++----- .../models/camembert/modeling_camembert.py | 14 ++++++------- .../models/camembert/modular_camembert.py | 12 +++++------ .../models/chameleon/modeling_chameleon.py | 4 ++-- .../chinese_clip/modeling_chinese_clip.py | 14 ++++++------- src/transformers/models/clap/modeling_clap.py | 14 ++++++------- src/transformers/models/clip/modeling_clip.py | 4 ++-- .../models/clipseg/modeling_clipseg.py | 6 +++--- src/transformers/models/clvp/modeling_clvp.py | 2 +- .../cohere2_vision/modeling_cohere2_vision.py | 6 +++--- .../cohere2_vision/modular_cohere2_vision.py | 6 +++--- src/transformers/models/csm/modeling_csm.py | 6 +++--- src/transformers/models/csm/modular_csm.py | 6 +++--- .../models/data2vec/modeling_data2vec_text.py | 14 ++++++------- .../models/data2vec/modular_data2vec_text.py | 12 +++++------ .../deepseek_vl/modeling_deepseek_vl.py | 2 +- .../modeling_deepseek_vl_hybrid.py | 2 +- .../modular_deepseek_vl_hybrid.py | 2 +- src/transformers/models/dia/modeling_dia.py | 8 ++++---- src/transformers/models/dia/modular_dia.py | 8 ++++---- .../models/distilbert/modeling_distilbert.py | 12 +++++------ .../models/edgetam/modeling_edgetam.py | 4 ++-- .../models/edgetam/modular_edgetam.py | 2 +- .../edgetam_video/modeling_edgetam_video.py | 2 +- .../models/electra/modeling_electra.py | 16 +++++++-------- src/transformers/models/emu3/modeling_emu3.py | 6 +++--- src/transformers/models/emu3/modular_emu3.py | 6 +++--- .../modeling_encoder_decoder.py | 2 +- .../models/ernie/modeling_ernie.py | 18 ++++++++--------- .../models/ernie/modular_ernie.py | 18 ++++++++--------- .../modeling_ernie4_5_vl_moe.py | 10 +++++----- .../modular_ernie4_5_vl_moe.py | 10 +++++----- src/transformers/models/esm/modeling_esm.py | 10 +++++----- .../models/evolla/modeling_evolla.py | 12 ++++++----- .../models/evolla/modular_evolla.py | 10 ++++++---- .../models/exaone4/modeling_exaone4.py | 2 +- .../models/exaone4/modular_exaone4.py | 2 +- .../models/falcon_h1/modeling_falcon_h1.py | 4 ++-- .../models/falcon_h1/modular_falcon_h1.py | 4 ++-- .../models/fast_vlm/modeling_fast_vlm.py | 6 
+++--- .../models/fast_vlm/modular_fast_vlm.py | 6 +++--- .../models/flava/modeling_flava.py | 4 ++-- .../models/florence2/modeling_florence2.py | 10 ++++------ .../models/florence2/modular_florence2.py | 10 ++++------ src/transformers/models/fuyu/modeling_fuyu.py | 4 ++-- .../models/gemma3/modeling_gemma3.py | 6 +++--- .../models/gemma3/modular_gemma3.py | 6 +++--- .../models/gemma3n/modeling_gemma3n.py | 6 +++--- .../models/gemma3n/modular_gemma3n.py | 6 +++--- src/transformers/models/git/modeling_git.py | 2 +- src/transformers/models/glm4/modeling_glm4.py | 2 +- src/transformers/models/glm4/modular_glm4.py | 2 +- .../models/glm46v/modeling_glm46v.py | 8 ++++---- .../models/glm4v/modeling_glm4v.py | 12 +++++------ .../models/glm4v/modular_glm4v.py | 10 +++++----- .../models/glm4v_moe/modeling_glm4v_moe.py | 12 +++++------ .../models/glm4v_moe/modular_glm4v_moe.py | 4 ++-- .../models/glm_image/modeling_glm_image.py | 8 ++++---- .../models/glm_image/modular_glm_image.py | 6 +++--- .../models/glmasr/modeling_glmasr.py | 4 ++-- .../models/glmasr/modular_glmasr.py | 4 ++-- .../models/got_ocr2/modeling_got_ocr2.py | 8 ++++---- .../models/got_ocr2/modular_got_ocr2.py | 6 +++--- .../gpt_bigcode/modeling_gpt_bigcode.py | 2 +- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../models/gpt_neox/modular_gpt_neox.py | 2 +- .../granite_speech/modeling_granite_speech.py | 6 ++---- .../models/granitemoe/modeling_granitemoe.py | 2 +- .../models/granitemoe/modular_granitemoe.py | 2 +- .../modeling_granitemoehybrid.py | 4 ++-- .../modular_granitemoehybrid.py | 2 +- .../modeling_granitemoeshared.py | 2 +- .../models/groupvit/modeling_groupvit.py | 4 ++-- .../models/idefics/modeling_idefics.py | 4 ++-- src/transformers/models/idefics/vision.py | 2 +- .../models/idefics2/modeling_idefics2.py | 8 ++++---- .../models/idefics3/modeling_idefics3.py | 8 ++++---- .../instructblip/modeling_instructblip.py | 12 +++++------ .../modeling_instructblipvideo.py | 12 +++++------ .../modular_instructblipvideo.py | 6 +++--- .../models/internvl/modeling_internvl.py | 8 ++++---- .../models/internvl/modular_internvl.py | 6 +++--- .../models/janus/modeling_janus.py | 4 ++-- .../models/janus/modular_janus.py | 4 ++-- .../models/kosmos2/modeling_kosmos2.py | 12 +++++------ .../models/layoutlm/modeling_layoutlm.py | 12 +++++------ .../models/lfm2_vl/modeling_lfm2_vl.py | 6 +++--- .../models/lfm2_vl/modular_lfm2_vl.py | 6 +++--- .../lighton_ocr/modeling_lighton_ocr.py | 6 +++--- .../models/lighton_ocr/modular_lighton_ocr.py | 4 ++-- .../models/llama4/modeling_llama4.py | 8 ++++---- .../models/llava/modeling_llava.py | 6 +++--- .../models/llava_next/modeling_llava_next.py | 6 +++--- .../modeling_llava_next_video.py | 8 ++++---- .../modular_llava_next_video.py | 8 ++++---- .../modeling_llava_onevision.py | 8 ++++---- .../modular_llava_onevision.py | 8 ++++---- .../models/markuplm/modeling_markuplm.py | 10 +++++----- .../models/metaclip_2/modeling_metaclip_2.py | 4 ++-- .../models/metaclip_2/modular_metaclip_2.py | 4 ++-- .../models/minimax/modeling_minimax.py | 2 +- .../models/minimax/modular_minimax.py | 2 +- .../models/mistral3/modeling_mistral3.py | 6 +++--- .../models/mistral3/modular_mistral3.py | 6 +++--- src/transformers/models/mlcd/modeling_mlcd.py | 2 +- src/transformers/models/mlcd/modular_mlcd.py | 2 +- .../models/mllama/modeling_mllama.py | 4 ++-- .../models/mobilebert/modeling_mobilebert.py | 16 +++++++-------- .../modeling_modernbert_decoder.py | 6 +++--- .../modular_modernbert_decoder.py | 6 +++--- 
.../models/moonshine/modeling_moonshine.py | 4 ++-- .../models/moonshine/modular_moonshine.py | 4 ++-- .../models/nllb_moe/modeling_nllb_moe.py | 8 ++++---- src/transformers/models/opt/modeling_opt.py | 6 +++--- .../models/ovis2/modeling_ovis2.py | 10 +++++----- .../models/ovis2/modular_ovis2.py | 10 +++++----- .../models/owlv2/modeling_owlv2.py | 4 ++-- .../models/owlvit/modeling_owlvit.py | 4 ++-- .../paddleocr_vl/modeling_paddleocr_vl.py | 8 ++++---- .../paddleocr_vl/modular_paddleocr_vl.py | 8 ++++---- .../models/paligemma/modeling_paligemma.py | 6 +++--- .../models/pe_audio/modeling_pe_audio.py | 2 +- .../models/pe_audio/modular_pe_audio.py | 2 +- .../pe_audio_video/modeling_pe_audio_video.py | 2 +- .../pe_audio_video/modular_pe_audio_video.py | 2 +- .../models/pe_video/modeling_pe_video.py | 6 +++--- .../models/pe_video/modular_pe_video.py | 6 +++--- .../perception_lm/modeling_perception_lm.py | 6 +++--- .../perception_lm/modular_perception_lm.py | 6 +++--- .../modeling_phi4_multimodal.py | 4 ++-- .../modular_phi4_multimodal.py | 4 ++-- .../models/pixtral/modeling_pixtral.py | 2 +- .../qwen2_5_omni/modeling_qwen2_5_omni.py | 12 ++++++----- .../qwen2_5_omni/modular_qwen2_5_omni.py | 12 ++++++----- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 8 ++++---- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 4 ++-- .../models/qwen2_vl/modeling_qwen2_vl.py | 6 +++--- .../qwen3_omni_moe/modeling_qwen3_omni_moe.py | 16 +++++++-------- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 4 ++-- .../models/qwen3_vl/modeling_qwen3_vl.py | 12 +++++------ .../models/qwen3_vl/modular_qwen3_vl.py | 12 +++++------ .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 12 +++++------ .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +- .../models/roberta/modeling_roberta.py | 14 ++++++------- .../models/roberta/modular_roberta.py | 12 +++++------ .../modeling_roberta_prelayernorm.py | 14 ++++++------- .../models/roc_bert/modeling_roc_bert.py | 16 +++++++-------- src/transformers/models/sam/modeling_sam.py | 2 +- src/transformers/models/sam2/modeling_sam2.py | 6 +++--- src/transformers/models/sam2/modular_sam2.py | 6 +++--- .../models/sam2_video/modeling_sam2_video.py | 2 +- src/transformers/models/sam3/modeling_sam3.py | 10 +++++----- .../sam3_tracker/modeling_sam3_tracker.py | 2 +- .../modeling_sam3_tracker_video.py | 2 +- .../modular_sam3_tracker_video.py | 2 +- .../models/sam_hq/modeling_sam_hq.py | 2 +- .../models/sam_hq/modular_sam_hq.py | 2 +- .../models/siglip/modeling_siglip.py | 4 ++-- .../models/siglip2/modeling_siglip2.py | 4 ++-- .../models/siglip2/modular_siglip2.py | 2 +- .../models/smolvlm/modeling_smolvlm.py | 8 ++++---- .../models/smolvlm/modular_smolvlm.py | 4 ++-- .../models/splinter/modeling_splinter.py | 4 ++-- .../models/starcoder2/modeling_starcoder2.py | 2 +- .../models/starcoder2/modular_starcoder2.py | 2 +- .../modeling_switch_transformers.py | 8 ++++---- .../modular_switch_transformers.py | 8 ++++---- .../models/t5gemma/modeling_t5gemma.py | 6 +++--- .../models/t5gemma/modular_t5gemma.py | 6 +++--- .../models/t5gemma2/modeling_t5gemma2.py | 4 ++-- .../models/t5gemma2/modular_t5gemma2.py | 4 ++-- .../video_llama_3/modeling_video_llama_3.py | 12 +++++------ .../video_llama_3/modular_video_llama_3.py | 12 +++++------ .../video_llava/modeling_video_llava.py | 8 ++++---- .../models/vipllava/modeling_vipllava.py | 4 ++-- .../models/vipllava/modular_vipllava.py | 4 ++-- .../modeling_vision_text_dual_encoder.py | 4 ++-- .../modeling_vitpose_backbone.py | 2 +- .../models/vjepa2/modeling_vjepa2.py | 2 +- 
.../models/voxtral/modeling_voxtral.py | 4 ++-- .../models/voxtral/modular_voxtral.py | 4 ++-- .../models/x_clip/modeling_x_clip.py | 6 +++--- .../xlm_roberta/modeling_xlm_roberta.py | 14 ++++++------- .../models/xlm_roberta/modular_xlm_roberta.py | 12 +++++------ .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 14 ++++++------- .../xlm_roberta_xl/modular_xlm_roberta_xl.py | 12 +++++------ .../models/xlstm/modeling_xlstm.py | 4 ++-- src/transformers/models/xmod/modeling_xmod.py | 14 ++++++------- 205 files changed, 683 insertions(+), 681 deletions(-) diff --git a/src/transformers/models/afmoe/modeling_afmoe.py b/src/transformers/models/afmoe/modeling_afmoe.py index 15e88fc1f00b..1a311011d4a8 100644 --- a/src/transformers/models/afmoe/modeling_afmoe.py +++ b/src/transformers/models/afmoe/modeling_afmoe.py @@ -571,7 +571,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/afmoe/modular_afmoe.py b/src/transformers/models/afmoe/modular_afmoe.py index d81a659e905b..d3e9b5ffcce1 100644 --- a/src/transformers/models/afmoe/modular_afmoe.py +++ b/src/transformers/models/afmoe/modular_afmoe.py @@ -392,7 +392,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 1000765e53a8..b8dd43481756 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -611,7 +611,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -646,7 +646,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 70c179cfd6a3..608a77cf6fbc 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -391,7 +391,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutputWithPooling | tuple: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -465,7 +465,7 @@ def forward( labels: torch.LongTensor | None = None, sentence_order_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked 
language modeling loss. Indices should be in `[-100, 0, ..., @@ -594,7 +594,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> MaskedLMOutput | tuple: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -686,7 +686,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> SequenceClassifierOutput | tuple: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -768,7 +768,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> TokenClassifierOutput | tuple: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -825,7 +825,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, @@ -892,7 +892,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
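The hunks above only touch static annotations; runtime behavior is unchanged,
since `can_return_tuple` still performs the `to_tuple()` conversion. A quick
sanity sketch (assuming the standard `albert-base-v2` checkpoint is available):

```python
import torch

from transformers import AlbertModel

model = AlbertModel.from_pretrained("albert-base-v2")
input_ids = torch.tensor([[2, 45, 89, 3]])

out = model(input_ids=input_ids)  # BaseModelOutputWithPooling, as annotated
tup = model(input_ids=input_ids, return_dict=False)  # plain tuple via the decorator
print(type(out).__name__, type(tup).__name__)
```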
diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 98d14fbb307c..91e0d71b9ab7 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -772,7 +772,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -898,7 +898,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1011,7 +1011,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Examples: @@ -1103,7 +1103,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1136,7 +1136,7 @@ def get_text_features( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1172,7 +1172,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | AlignOutput: + ) -> AlignOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
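Statically, the payoff of patch 01 shows up on these cleaned-up signatures. A
hypothetical check with a static type checker (the exact rendering of the
revealed type varies by checker, and this assumes `AlignModel.forward` is
wrapped by one of the two decorators, as the new consistency script requires):

```python
# demo_typing.py -- inspect with e.g. `pyright demo_typing.py`.
# `reveal_type` is evaluated by the type checker, not at runtime.
from transformers.models.align.modeling_align import AlignModel


def inspect_forward(model: AlignModel) -> None:
    # Expected along the lines of `(...) -> tuple[Any, ...] | AlignOutput`,
    # even though the method itself is annotated with plain `AlignOutput`.
    reveal_type(model.forward)
```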
diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 13c94fa88362..62ffd1cc8154 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -385,7 +385,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -617,7 +617,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -842,7 +842,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, interpolate_pos_encoding: bool | None = False, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1069,7 +1069,7 @@ def forward( return_dict: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPoolingAndProjection: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Examples: @@ -1164,7 +1164,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1199,7 +1199,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 91aecac948b7..1b23b30f9125 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -926,7 +926,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -983,7 +983,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1099,7 +1099,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index cd84941a9c34..708187ff3891 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1271,7 +1271,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -1304,7 +1304,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1383,7 +1383,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index f88a19796f34..6794248d12a9 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -327,7 +327,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -455,7 +455,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index b846957940cc..2999993f01e6 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -68,7 +68,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -156,7 +156,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 91517071cb17..b2d9454cafe1 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -190,7 +190,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -256,7 +256,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -356,7 +356,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/aya_vision/modular_aya_vision.py b/src/transformers/models/aya_vision/modular_aya_vision.py index 378a826e1f2e..e5880791638c 100644 --- a/src/transformers/models/aya_vision/modular_aya_vision.py +++ b/src/transformers/models/aya_vision/modular_aya_vision.py @@ -115,7 +115,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -157,7 +157,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -212,7 +212,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 1f3d08a5ce31..ca0f85f73bf6 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -649,7 +649,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -788,7 +788,7 @@ def forward( labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BertForPreTrainingOutput: + ) -> BertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -893,7 +893,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -979,7 +979,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1064,7 +1064,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -1152,7 +1152,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1231,7 +1231,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1330,7 +1330,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1387,7 +1387,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.bert( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index bccb17124aa9..b3a5f344379a 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -518,7 +518,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -673,7 +673,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index e79ccdfa7d5e..919c938bdf12 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -486,7 +486,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -570,7 +570,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -602,7 +602,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -693,7 +693,7 @@ def forward( return_loss: bool | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipOutput: + ) -> BlipOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
@@ -809,7 +809,7 @@ def forward( interpolate_pos_encoding: bool = False, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipForConditionalGenerationModelOutput: + ) -> BlipForConditionalGenerationModelOutput: r""" Examples: @@ -980,7 +980,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" Examples: @@ -1208,7 +1208,7 @@ def forward( attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" use_itm_head (`bool`, *optional*, defaults to `True`): Whether or not to use the image-text matching head. diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 0ec86489224e..cf5beb05d93b 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -507,7 +507,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -807,7 +807,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -963,7 +963,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1091,7 +1091,7 @@ def get_text_features( decoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1150,7 +1150,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: ```python @@ -1251,7 +1251,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2ForConditionalGenerationModelOutput: + ) -> Blip2ForConditionalGenerationModelOutput: r""" decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. 
            Causal mask will also
@@ -1396,7 +1396,7 @@ def forward(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2TextModelOutput:
+    ) -> Blip2TextModelOutput:
         r"""
         Examples:
 
@@ -1478,7 +1478,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2VisionModelOutput:
+    ) -> Blip2VisionModelOutput:
         r"""
         Examples:
 
@@ -1622,7 +1622,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool | None = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
+    ) -> BaseModelOutputWithVisionQformerOutputs:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1696,7 +1696,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2ForConditionalGenerationModelOutput:
+    ) -> Blip2ForConditionalGenerationModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py
index 15553adbf315..2c0ce5cac266 100644
--- a/src/transformers/models/bridgetower/modeling_bridgetower.py
+++ b/src/transformers/models/bridgetower/modeling_bridgetower.py
@@ -1029,7 +1029,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py
index 9b840fdcd5ab..e2cce8060118 100755
--- a/src/transformers/models/bros/modeling_bros.py
+++ b/src/transformers/models/bros/modeling_bros.py
@@ -423,7 +423,7 @@ def forward(
         output_attentions: bool | None = False,
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions:
+    ) -> BaseModelOutputWithCrossAttentions:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
         all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@@ -570,7 +570,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -709,7 +709,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -830,7 +830,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BrosSpadeOutput:
+    ) -> BrosSpadeOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -967,7 +967,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index 7bdcb24be61e..528c75128033 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -628,7 +628,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -768,7 +768,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -864,7 +864,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -948,7 +948,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1049,7 +1049,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1117,7 +1117,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1218,7 +1218,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/camembert/modular_camembert.py b/src/transformers/models/camembert/modular_camembert.py
index a7d98b334983..3e903ef9652d 100644
--- a/src/transformers/models/camembert/modular_camembert.py
+++ b/src/transformers/models/camembert/modular_camembert.py
@@ -74,7 +74,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -137,7 +137,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -216,7 +216,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -308,7 +308,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -371,7 +371,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -450,7 +450,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index d4ec10c50bdc..48e8c431450e 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -893,7 +893,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1081,7 +1081,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
index e3e45c96d5d5..bba93ac10d96 100644
--- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
@@ -632,7 +632,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
 
@@ -683,7 +683,7 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -750,7 +750,7 @@ def forward(
         output_hidden_states: bool | None = None,
         interpolate_pos_encoding: bool = False,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -843,7 +843,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1010,7 +1010,7 @@ def get_text_features(
         token_type_ids: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1046,7 +1046,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1093,7 +1093,7 @@ def forward(
         interpolate_pos_encoding: bool = False,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ChineseCLIPOutput:
+    ) -> ChineseCLIPOutput:
         r"""
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index 64223c23e8c1..b799bb92c75e 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -1260,7 +1260,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
 
@@ -1458,7 +1458,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1561,7 +1561,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1596,7 +1596,7 @@ def get_audio_features(
         is_longer: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1638,7 +1638,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapOutput:
+    ) -> ClapOutput:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1752,7 +1752,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapTextModelOutput:
+    ) -> ClapTextModelOutput:
         r"""
         Examples:
 
@@ -1816,7 +1816,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapAudioModelOutput:
+    ) -> ClapAudioModelOutput:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 54e02843c2de..6754fb97f9c2 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -789,7 +789,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -824,7 +824,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py
index 818ebab02f88..b788b45a276f 100644
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -499,7 +499,7 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -859,7 +859,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -893,7 +893,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = True,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py
index 1a9fcccea86d..f34a3781898e 100644
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@@ -1505,7 +1505,7 @@ def get_text_features(
         text_encoder_inputs_embeds: torch.FloatTensor | None = None,
         attention_mask: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | ClvpEncoderOutput:
+    ) -> ClvpEncoderOutput:
         r"""
         text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
             inputs_embeds for the text encoder model passed in place of `input_ids`.
diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
index 0123d7064a41..bd53ebd60030 100644
--- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
@@ -171,7 +171,7 @@ def set_input_embeddings(self, value):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
@@ -215,7 +215,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | Cohere2VisionModelOutputWithPast:
+    ) -> Cohere2VisionModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -295,7 +295,7 @@ def forward(
         logits_to_keep: int | torch.Tensor = 0,
         image_sizes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Cohere2VisionCausalLMOutputWithPast:
+    ) -> Cohere2VisionCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
index ffe43efe3b80..f3618cfacf4d 100644
--- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
@@ -102,7 +102,7 @@ class Cohere2VisionModel(AyaVisionModel):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
@@ -122,7 +122,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | Cohere2VisionModelOutputWithPast:
+    ) -> Cohere2VisionModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -181,7 +181,7 @@ def forward(
         logits_to_keep: int | torch.Tensor = 0,
         image_sizes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Cohere2VisionCausalLMOutputWithPast:
+    ) -> Cohere2VisionCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py
index 77399420a9e9..50f1c7d2073d 100644
--- a/src/transformers/models/csm/modeling_csm.py
+++ b/src/transformers/models/csm/modeling_csm.py
@@ -455,7 +455,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -588,7 +588,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -961,7 +961,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CsmOutputWithPast:
+    ) -> CsmOutputWithPast:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
             1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py
index c19deb1d9f58..89f26a9e1f76 100644
--- a/src/transformers/models/csm/modular_csm.py
+++ b/src/transformers/models/csm/modular_csm.py
@@ -173,7 +173,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -325,7 +325,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -609,7 +609,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CsmOutputWithPast:
+    ) -> CsmOutputWithPast:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
             1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py
index 93177572caea..071b20fc0dc3 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_text.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -591,7 +591,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -780,7 +780,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -881,7 +881,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -946,7 +946,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -1021,7 +1021,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1121,7 +1121,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -1180,7 +1180,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         outputs = self.data2vec_text(
             input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/data2vec/modular_data2vec_text.py b/src/transformers/models/data2vec/modular_data2vec_text.py
index ac77a81841d9..eca25868b4a6 100644
--- a/src/transformers/models/data2vec/modular_data2vec_text.py
+++ b/src/transformers/models/data2vec/modular_data2vec_text.py
@@ -145,7 +145,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -246,7 +246,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -311,7 +311,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -386,7 +386,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -486,7 +486,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -545,7 +545,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         outputs = self.data2vec_text(
             input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
index 0113c1e5235c..7ef6e2f4a6e6 100644
--- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
@@ -157,7 +157,7 @@ def set_input_embeddings(self, value):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs)
         vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
 
diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
index 15f653d7d5c0..cb4d8a7381c3 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
@@ -299,7 +299,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         high_res_pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithHighResVisionEncodings:
+    ) -> BaseModelOutputWithHighResVisionEncodings:
         low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs)
         high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs)
         image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state)
diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
index 8488833e6aaf..d0f2e06efc12 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -314,7 +314,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         high_res_pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithHighResVisionEncodings:
+    ) -> BaseModelOutputWithHighResVisionEncodings:
         low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs)
         high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs)
         image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state)
diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py
index b2b7b34dd923..06e56c6b0e3f 100644
--- a/src/transformers/models/dia/modeling_dia.py
+++ b/src/transformers/models/dia/modeling_dia.py
@@ -467,7 +467,7 @@ def forward(
         output_attentions: bool | None = False,
         output_hidden_states: bool | None = False,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> BaseModelOutput | tuple:
+    ) -> BaseModelOutput:
         hidden_states = self.embedding(input_ids)
 
         # RoPE
@@ -600,7 +600,7 @@ def forward(
         output_hidden_states: bool | None = False,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> BaseModelOutputWithPastAndCrossAttentions | tuple:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. @@ -712,7 +712,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -849,7 +849,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index d265bea97c51..c1c905d0708c 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -257,7 +257,7 @@ def forward( output_attentions: bool | None = False, output_hidden_states: bool | None = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> BaseModelOutput | tuple: + ) -> BaseModelOutput: hidden_states = self.embedding(input_ids) # RoPE @@ -390,7 +390,7 @@ def forward( output_hidden_states: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> BaseModelOutputWithPastAndCrossAttentions | tuple: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. @@ -502,7 +502,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -639,7 +639,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index bd8c75b2a855..8beb18db62ef 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -390,7 +390,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutput | tuple[torch.Tensor, ...]: + ) -> BaseModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): Indices of input sequence tokens in the vocabulary. 
@@ -481,7 +481,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> MaskedLMOutput | tuple[torch.Tensor, ...]:
+    ) -> MaskedLMOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
             Indices of input sequence tokens in the vocabulary.
@@ -575,7 +575,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> SequenceClassifierOutput | tuple[torch.Tensor, ...]:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -674,7 +674,7 @@ def forward(
         end_positions: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> QuestionAnsweringModelOutput | tuple[torch.Tensor, ...]:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
             Indices of input sequence tokens in the vocabulary.
@@ -773,7 +773,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> TokenClassifierOutput | tuple[torch.Tensor, ...]:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -848,7 +848,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> MultipleChoiceModelOutput | tuple[torch.Tensor, ...]:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py
index 34c59b50a8d4..bf5293911fdb 100644
--- a/src/transformers/models/edgetam/modeling_edgetam.py
+++ b/src/transformers/models/edgetam/modeling_edgetam.py
@@ -449,7 +449,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
 
@@ -1204,7 +1204,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py
index 7113d1a8dadc..c00b74c9c83c 100644
--- a/src/transformers/models/edgetam/modular_edgetam.py
+++ b/src/transformers/models/edgetam/modular_edgetam.py
@@ -195,7 +195,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
 
diff --git a/src/transformers/models/edgetam_video/modeling_edgetam_video.py b/src/transformers/models/edgetam_video/modeling_edgetam_video.py
index ef8623eb3bfe..e52d67cfa4fb 100644
--- a/src/transformers/models/edgetam_video/modeling_edgetam_video.py
+++ b/src/transformers/models/edgetam_video/modeling_edgetam_video.py
@@ -2235,7 +2235,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVideoVisionEncoderOutput:
+    ) -> EdgeTamVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index 2fe4c2bea9b9..6a6b8168c692 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -594,7 +594,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions:
+    ) -> BaseModelOutputWithCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -844,7 +844,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -922,7 +922,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ElectraForPreTrainingOutput:
+    ) -> ElectraForPreTrainingOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
@@ -1024,7 +1024,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -1091,7 +1091,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -1150,7 +1150,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         discriminator_hidden_states = self.electra(
             input_ids,
             attention_mask=attention_mask,
@@ -1217,7 +1217,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1332,7 +1332,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 2d41f4aead43..96f92f5f4ae2 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -1403,7 +1403,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | Emu3VQVAEModelOutput:
+    ) -> Emu3VQVAEModelOutput:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images.
@@ -1482,7 +1482,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
@@ -1563,7 +1563,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py
index ac4c79ae3cf2..7a75d4636267 100644
--- a/src/transformers/models/emu3/modular_emu3.py
+++ b/src/transformers/models/emu3/modular_emu3.py
@@ -957,7 +957,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | Emu3VQVAEModelOutput:
+    ) -> Emu3VQVAEModelOutput:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images.
@@ -1036,7 +1036,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
@@ -1117,7 +1117,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 2df8cf56eb05..3f6f5895ff08 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -329,7 +329,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
             Indices of decoder input sequence tokens in the vocabulary.
diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py
index e300858722e5..2693da468327 100644
--- a/src/transformers/models/ernie/modeling_ernie.py
+++ b/src/transformers/models/ernie/modeling_ernie.py
@@ -612,7 +612,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -808,7 +808,7 @@ def forward(
         labels: torch.Tensor | None = None,
         next_sentence_label: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput:
+    ) -> ErnieForPreTrainingOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -930,7 +930,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1023,7 +1023,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1125,7 +1125,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | NextSentencePredictorOutput:
+    ) -> NextSentencePredictorOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1220,7 +1220,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1306,7 +1306,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1412,7 +1412,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1476,7 +1476,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
diff --git a/src/transformers/models/ernie/modular_ernie.py b/src/transformers/models/ernie/modular_ernie.py
index de913662c91d..8f062cfa6c73 100644
--- a/src/transformers/models/ernie/modular_ernie.py
+++ b/src/transformers/models/ernie/modular_ernie.py
@@ -205,7 +205,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -347,7 +347,7 @@ def forward(
         labels: torch.Tensor | None = None,
         next_sentence_label: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput:
+    ) -> ErnieForPreTrainingOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -430,7 +430,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -500,7 +500,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -554,7 +554,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | NextSentencePredictorOutput:
+    ) -> NextSentencePredictorOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -628,7 +628,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -700,7 +700,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -791,7 +791,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -844,7 +844,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
diff --git a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
index 3b02f84c8d84..d9d2f83f963e 100644
--- a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
+++ b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
@@ -936,7 +936,7 @@ def rot_pos_emb(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
@@ -1273,7 +1273,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1298,7 +1298,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1372,7 +1372,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
@@ -1663,7 +1663,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeCausalLMOutputWithPast:
+    ) -> MoeCausalLMOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
index 1093d0d9edc9..8b32a5d83877 100644
--- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
+++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
@@ -891,7 +891,7 @@ def get_device(self):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         hidden_states = self.patch_embed(hidden_states)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -1275,7 +1275,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs = self.vision_tower(pixel_values_videos, video_grid_thw, return_dict=True, **kwargs)
         video_embeds = self.resampler_model(video_outputs.last_hidden_state, video_grid_thw)
         split_sizes = (
@@ -1294,7 +1294,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, image_grid_thw, return_dict=True, **kwargs)
         image_embeds = self.resampler_model(image_outputs.last_hidden_state, image_grid_thw)
         split_sizes = (image_grid_thw.prod(-1) // self.vision_tower.spatial_merge_size**2).tolist()
@@ -1321,7 +1321,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
@@ -1475,7 +1475,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeCausalLMOutputWithPast:
+    ) -> MoeCausalLMOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py
index 6a95eefa494d..f1a33fb16687 100755
--- a/src/transformers/models/esm/modeling_esm.py
+++ b/src/transformers/models/esm/modeling_esm.py
@@ -498,7 +498,7 @@ def forward(
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> BaseModelOutputWithCrossAttentions:
         for i, layer_module in enumerate(self.layer):
             hidden_states = layer_module(
                 hidden_states,
@@ -620,7 +620,7 @@ def forward(
         encoder_hidden_states: torch.Tensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
             Indices of input sequence tokens in the vocabulary.
@@ -756,7 +756,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -842,7 +842,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -915,7 +915,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 7961ea75cd3f..746b06163868 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -470,7 +470,7 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithCrossAttentions: for i, layer_module in enumerate(self.layer): hidden_states = layer_module( hidden_states, @@ -544,7 +544,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -700,7 +700,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -1319,7 +1321,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -1430,7 +1432,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
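
Call sites do not lose the tuple branch: the decorator's signature re-attaches it, so the narrowed source annotations above stay honest under static analysis. A hypothetical probe (model and inputs assumed; `reveal_type` is interpreted by the checker, not at runtime, and the rendering is approximate):

    outputs = model.forward(input_ids, attention_mask)
    reveal_type(outputs)  # pyright: "tuple[Unknown, ...] | BaseModelOutputWithPast"

    if not isinstance(outputs, tuple):
        hidden = outputs.last_hidden_state  # narrowed to the dataclass branch
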
diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index c60de607e445..23d0d1a14b12 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -228,7 +228,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -384,7 +384,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -782,7 +784,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -893,7 +895,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
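
If one of these signatures regresses to a hand-written `tuple | ...` union, or loses its annotation, the new consistency check reports it, and the same script can repair the files in place:

    python utils/check_decorator_return_types.py                      # report offenders
    python utils/check_decorator_return_types.py --fix_and_overwrite  # rewrite signatures in place
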
diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index 7e87fbf5a337..3d4f26193a24 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -390,7 +390,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index abab32e8ee8b..fd3d6a1ebdc7 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -339,7 +339,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 3bf6616b5453..8a90ba266bf8 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1289,7 +1289,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1529,7 +1529,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 12470ca974bd..442684385204 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -1015,7 +1015,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1239,7 +1239,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/fast_vlm/modeling_fast_vlm.py b/src/transformers/models/fast_vlm/modeling_fast_vlm.py index e2e2f2bb90b7..19f921ff1f37 100644 --- a/src/transformers/models/fast_vlm/modeling_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modeling_fast_vlm.py @@ -124,7 +124,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = 
None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -183,7 +183,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -313,7 +313,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py index fbe891b334fd..b77e8cc5f559 100644 --- a/src/transformers/models/fast_vlm/modular_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py @@ -180,7 +180,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -215,7 +215,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -288,7 +288,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 87f7766e6631..2ffd53a5d8e0 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1007,7 +1007,7 @@ def get_text_features( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. 
See @@ -1058,7 +1058,7 @@ def get_image_features( interpolate_pos_encoding: bool | None = None, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index f4fe735ce4f5..8953e2d84397 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -554,9 +554,7 @@ def __init__(self, config: Florence2VisionConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -688,7 +686,7 @@ def set_input_embeddings(self, value): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -741,7 +739,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -878,7 +876,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index f94974769f7a..6de41683260e 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1400,9 +1400,7 @@ def __init__(self, config: Florence2VisionConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -1520,7 +1518,7 @@ def get_encoder(self, modality=None): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. 
@@ -1549,7 +1547,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1649,7 +1647,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 63f7be1f27b7..a470a10b082d 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -118,7 +118,7 @@ def gather_continuous_embeddings( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -269,7 +269,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | None = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*): Image patches to be used as continuous embeddings. The patches are flattened and then projected to the diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 9893d7a0f9a8..77ca169d8107 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -843,7 +843,7 @@ def set_input_embeddings(self, value): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -889,7 +889,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1029,7 +1029,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 31424e010f76..91498e86d9b2 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -803,7 +803,7 @@ def __init__(self, config: Gemma3Config): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -825,7 +825,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -910,7 +910,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 500e770c3df1..a41c67649eb5 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -944,7 +944,7 @@ def __init__(self, config: Gemma3nAudioConfig): @check_model_inputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -1952,7 +1952,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2163,7 +2163,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. 
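
Because the sweep is mechanical, it is enforceable by plain AST inspection. A minimal sketch of the shape of that rule follows; it is an illustration with made-up helper names (decorator_names, union_members, is_tuple, offending_functions), not the actual utils/check_decorator_return_types.py:

    import ast

    DECORATORS = {"can_return_tuple", "check_model_inputs"}


    def decorator_names(node: ast.FunctionDef) -> set[str]:
        names = set()
        for dec in node.decorator_list:
            target = dec.func if isinstance(dec, ast.Call) else dec  # handles `@check_model_inputs(...)`
            if isinstance(target, ast.Name):
                names.add(target.id)
        return names


    def union_members(node: ast.expr):
        # flatten `A | B | C` (nested `|` BinOps) into its members
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr):
            yield from union_members(node.left)
            yield from union_members(node.right)
        else:
            yield node


    def is_tuple(node: ast.expr) -> bool:
        if isinstance(node, ast.Subscript):  # also catches `tuple[torch.Tensor]`
            node = node.value
        return isinstance(node, ast.Name) and node.id == "tuple"


    def offending_functions(source: str) -> list[tuple[int, str]]:
        bad = []
        for node in ast.walk(ast.parse(source)):
            if isinstance(node, ast.FunctionDef) and decorator_names(node) & DECORATORS:
                returns = node.returns
                if returns is None or (isinstance(returns, ast.Constant) and returns.value is None):
                    bad.append((node.lineno, "missing or `None` return annotation"))
                elif any(is_tuple(member) for member in union_members(returns)):
                    bad.append((node.lineno, "return annotation already includes `tuple`"))
        return bad
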
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index fd95a8b37366..89e6fc8dd79d 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1501,7 +1501,7 @@ def __init__(self, config: Gemma3nAudioConfig): @check_model_inputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -2219,7 +2219,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2430,7 +2430,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 056442142cc0..566d6fcde849 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -777,7 +777,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index dd11e0d4dfe0..dcf61cf09648 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -481,7 +481,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4/modular_glm4.py b/src/transformers/models/glm4/modular_glm4.py index b9ec811fbc3d..f9feb89995e2 100644 --- a/src/transformers/models/glm4/modular_glm4.py +++ b/src/transformers/models/glm4/modular_glm4.py @@ -92,7 +92,7 @@ class Glm4ForCausalLM(GlmForCausalLM): def forward( self, **super_kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index de7b79923826..d7f7fca9d4a0 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -303,7 +303,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -333,7 +333,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -406,7 +406,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VModelOutputWithPast: + ) -> Glm46VModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -595,7 +595,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VCausalLMOutputWithPast: + ) -> Glm46VCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index fb6de9c68dc6..e2275eb86b5c 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -767,7 +767,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -858,7 +858,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1153,7 +1153,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. 
@@ -1183,7 +1183,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1256,7 +1256,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1445,7 +1445,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 0f1d57404d50..97d22db4b5e9 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -786,7 +786,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -867,7 +867,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1146,7 +1146,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1227,7 +1227,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1341,7 +1341,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index bc84e5b83e00..d3ac4b991ffa 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -833,7 +833,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -1001,7 +1001,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1322,7 +1322,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1352,7 +1352,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1425,7 +1425,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeModelOutputWithPast: + ) -> Glm4vMoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1667,7 +1667,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index dd59ae412916..75fea0c4f71e 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -415,7 +415,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -509,7 +509,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/src/transformers/models/glm_image/modeling_glm_image.py b/src/transformers/models/glm_image/modeling_glm_image.py index b694814e82df..33d0a6ac6965 100644 --- a/src/transformers/models/glm_image/modeling_glm_image.py +++ b/src/transformers/models/glm_image/modeling_glm_image.py @@ -720,7 +720,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. @@ -868,7 +868,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1200,7 +1200,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1260,7 +1260,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glm_image/modular_glm_image.py b/src/transformers/models/glm_image/modular_glm_image.py index d2f4604a4e72..14fff7a191c1 100644 --- a/src/transformers/models/glm_image/modular_glm_image.py +++ b/src/transformers/models/glm_image/modular_glm_image.py @@ -589,7 +589,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. 
@@ -909,7 +909,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -967,7 +967,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index 9a08c0ec1adb..7001d8848a00 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -312,7 +312,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @check_model_inputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -395,7 +395,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/glmasr/modular_glmasr.py b/src/transformers/models/glmasr/modular_glmasr.py index a81a6ed7748d..ce2bfc505219 100644 --- a/src/transformers/models/glmasr/modular_glmasr.py +++ b/src/transformers/models/glmasr/modular_glmasr.py @@ -325,7 +325,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @check_model_inputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -364,7 +364,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs) audio_hidden_states = audio_outputs.last_hidden_state audio_hidden_states = audio_hidden_states.reshape( diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index e5f2d747cf1c..2b5de980c59d 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -437,7 +437,7 @@ def get_input_embeddings(self): @check_model_inputs(tie_last_hidden_states=False) def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | GotOcr2VisionEncoderOutput: + ) -> GotOcr2VisionEncoderOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -556,7 +556,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -603,7 +603,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) -> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -701,7 +701,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index cf7a3e584549..af45aae43c88 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -308,7 +308,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -329,7 +329,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) -> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -393,7 +393,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 4b89bcc9e140..41efb636438c 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -416,7 +416,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`): `input_ids_length` = `sequence_length` if `past_key_values` is `None` else diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 2d9e16419e81..5270f2af55f3 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -551,7 +551,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index b157af6a1bd2..9cbff5a6787f 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -424,7 +424,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5f6bf6461141..38775edcf92e 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -297,9 +297,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): hidden_states = layer(hidden_states, attention_dists=self.attention_dists) @@ -361,7 +359,7 @@ def get_output_embeddings(self): @auto_docstring def get_audio_features( self, input_features: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.encoder(input_features, return_dict=True, **kwargs) projected_embeds = self.projector(audio_outputs.last_hidden_state) audio_outputs.pooler_output = projected_embeds diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 527b5251d3be..ee2a959f7eca 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -666,7 +666,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 6dc38232ec63..8ba86eb96e58 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -247,7 +247,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 60291ecefbe8..fcb7d1a71226 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1287,7 +1287,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1470,7 +1470,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index f1930a4cca75..a11121d4d355 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -221,7 +221,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 3f177aa2475c..6dc665746d21 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -735,7 +735,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index c7b9870cd606..467b7c7ea9c4 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1236,7 +1236,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1269,7 +1269,7 @@ def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index fa23e47bf4d6..220209a8a25e 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -956,7 +956,7 @@ def forward( interpolate_pos_encoding: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsBaseModelOutputWithPast: + ) -> IdeficsBaseModelOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. @@ -1144,7 +1144,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsCausalLMOutputWithPast: + ) -> IdeficsCausalLMOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 91bdb78c3bed..23026335fe3d 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -354,7 +354,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index e1ef275d519a..5e51cc21f140 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -469,7 +469,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" patch_attention_mask (`torch.BoolTensor` of shape `(batch_size, num_patches_height, num_patches_width)`, *optional*): The attention mask for the patches. @@ -834,7 +834,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -906,7 +906,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics2BaseModelOutputWithPast: + ) -> Idefics2BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -1035,7 +1035,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics2CausalLMOutputWithPast: + ) -> Idefics2CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index d1ac39b42115..c9b2ffed9cc8 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -476,7 +476,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: batch_size = pixel_values.size(0) if patch_attention_mask is None: patch_size = self.patch_size @@ -577,7 +577,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -650,7 +650,7 @@ def forward( cache_position: torch.LongTensor | None = None, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics3BaseModelOutputWithPast: + ) -> Idefics3BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -790,7 +790,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics3CausalLMOutputWithPast: + ) -> Idefics3CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. 
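
One more runtime detail the wrapper preserves, shown here by reusing the hypothetical ToyModel sketch after the esm diff above: an explicit `return_dict=None` defers to the config, since only a non-None override wins.

    model.config.return_dict = False
    assert isinstance(model.forward(1), tuple)                        # config default applies
    assert isinstance(model.forward(1, return_dict=None), tuple)      # None defers to config
    assert isinstance(model.forward(1, return_dict=True), ToyOutput)  # explicit override wins
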
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 78ab67d2c09a..397f4b5b24c2 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -414,7 +414,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -713,7 +713,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -872,7 +872,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1019,7 +1019,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1197,7 +1197,7 @@ def get_image_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1285,7 +1285,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. 
Input tokens can optionally be provided diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 121db617af5c..7c8af79e7dc1 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -419,7 +419,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -713,7 +713,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -822,7 +822,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1008,7 +1008,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1253,7 +1253,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. @@ -1462,7 +1462,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 8b76bfc6fba1..6e0a58f12439 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -192,7 +192,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, @@ -294,7 +294,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -393,7 +393,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 498caae1044e..95e972c2c566 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -447,7 +447,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -554,7 +554,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. 
@@ -632,7 +632,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -796,7 +796,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLCausalLMOutputWithPast: + ) -> InternVLCausalLMOutputWithPast: r""" Example:
diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index d5ec73e498da..46736f29b547 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -401,7 +401,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -494,7 +494,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -548,7 +548,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 7ef79803672f..b87bde2a9208 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -459,7 +459,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1001,7 +1001,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index c636b69b47a4..3ea7811355ba 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -566,7 +566,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -942,7 +942,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index bcfff60f0b02..e26866c9ddf4 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -448,7 +448,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1264,7 +1264,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1339,7 +1339,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1514,7 +1514,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithProjectionAttentions: + ) -> BaseModelOutputWithProjectionAttentions: if "return_attentions" in kwargs: warnings.warn( "`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict`" @@ -1557,7 +1557,7 @@ def forward( interpolate_pos_encoding: bool = False, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Kosmos2ModelOutput: + ) -> Kosmos2ModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0, @@ -1693,7 +1693,7 @@ def forward( output_hidden_states: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Kosmos2ForConditionalGenerationModelOutput: + ) -> Kosmos2ForConditionalGenerationModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features .
Mask values selected in `[0, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 677f6eb75859..c27593c29825 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -326,7 +326,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -466,7 +466,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -602,7 +602,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -719,7 +719,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -854,7 +854,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -968,7 +968,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py index 5ff43fadaa10..902e868eacc4 100755 --- a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -171,7 +171,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -250,7 +250,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. 
@@ -361,7 +361,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py index 2ff5a055fff9..70abc69756a4 100644 --- a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py @@ -102,7 +102,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -181,7 +181,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. @@ -271,7 +271,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py index c95633254ae9..03918c4c59b2 100644 --- a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py @@ -175,7 +175,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -229,7 +229,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -353,7 +353,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrCausalLMOutputWithPast: + ) -> LightOnOcrCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py index 310118ac1a4b..ea347fccb3ec 100644 --- a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py @@ -310,7 +310,7 @@ def __init__(self, config: LightOnOcrConfig): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -340,7 +340,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index cbc8b42e9114..c1c29bf604fa 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -535,7 +535,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -625,7 +625,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1214,7 +1214,7 @@ def get_image_features( pixel_values: torch.FloatTensor, vision_feature_select_strategy: str, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -1267,7 +1267,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Llama4CausalLMOutputWithPast: + ) -> Llama4CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 9fbd58c786df..2431c2ab6020 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -157,7 +157,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -236,7 +236,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaModelOutputWithPast: + ) -> LlavaModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -336,7 +336,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaCausalLMOutputWithPast: + ) -> LlavaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 311f958005b5..6be577f6fab8 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -356,7 +356,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -460,7 +460,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextModelOutputWithPast: + ) -> LlavaNextModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -603,7 +603,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextCausalLMOutputWithPast: + ) -> LlavaNextCausalLMOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. 
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 687899c8f5cd..90b7b7771e11 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -408,7 +408,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -529,7 +529,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -613,7 +613,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -744,7 +744,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 345798810cb3..1d5427eda671 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -282,7 +282,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -353,7 +353,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. 
@@ -454,7 +454,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -571,7 +571,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 3c7931e2e4ce..d950a8c59256 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -367,7 +367,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -493,7 +493,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -583,7 +583,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -729,7 +729,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. 
diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index 6c49d02662f2..692d76a9fc45 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -324,7 +324,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -397,7 +397,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -459,7 +459,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -565,7 +565,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 0a01c4b556f4..c4a0fa5e6c80 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -474,7 +474,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -558,7 +558,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -666,7 +666,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. 
@@ -782,7 +782,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -885,7 +885,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth.
diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index 2440e5fcb58f..5571259d128e 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -799,7 +799,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -831,7 +831,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples:
diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 4fc1aa6e3d6a..d7eb26d99ab6 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -563,7 +563,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -591,7 +591,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples:
diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index a2dff7e9401b..841c81694c8c 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -654,7 +654,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index c05e7db364f5..5e2f67bc8150 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -546,7 +546,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 03112e6fb77d..76a7ff7b9582 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -226,7 +226,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -296,7 +296,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -407,7 +407,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/mistral3/modular_mistral3.py b/src/transformers/models/mistral3/modular_mistral3.py index 7ebf180b3e3e..2f16bac7b3cf 100644 --- a/src/transformers/models/mistral3/modular_mistral3.py +++ b/src/transformers/models/mistral3/modular_mistral3.py @@ -131,7 +131,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -177,7 +177,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -258,7 +258,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss.
Indices should either be in `[0, ..., diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index a3f2498a9192..e0169855ad2c 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -528,7 +528,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index f572d2758716..040e7bab69b4 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -439,7 +439,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 5716e6c3d6e5..cda5b30ce38b 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1360,7 +1360,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. This tensor contains the processed image features that @@ -1618,7 +1618,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" aspect_ratio_mask (`torch.Tensor` of shape `(batch_size, max_num_images, max_num_tiles)`, *optional*): Mask to avoid performing attention on padding tiles. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index fe25e0b139c8..bef2d879d5f6 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -622,7 +622,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -700,7 +700,7 @@ def forward( labels: torch.LongTensor | None = None, next_sentence_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MobileBertForPreTrainingOutput: + ) -> MobileBertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -798,7 +798,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -867,7 +867,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -954,7 +954,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1032,7 +1032,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.mobilebert( input_ids, attention_mask=attention_mask, @@ -1103,7 +1103,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1203,7 +1203,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. diff --git a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index f0a6d343b99c..af6581968093 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -484,7 +484,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -589,7 +589,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -692,7 +692,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index e4005404e556..6c3eec505d84 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -535,7 +535,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -640,7 +640,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -743,7 +743,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 7d8731e0ddc0..001c0f735db8 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -556,7 +556,7 @@ def forward( input_values: torch.FloatTensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): @@ -648,7 +648,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention
diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py
index 6517494ffe7f..f0f15232d3f5 100644
--- a/src/transformers/models/moonshine/modular_moonshine.py
+++ b/src/transformers/models/moonshine/modular_moonshine.py
@@ -519,7 +519,7 @@ def forward(
         input_values: torch.FloatTensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         Args:
             input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
@@ -601,7 +601,7 @@ def forward(
         encoder_hidden_states: torch.FloatTensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
index 2cc7aba473c6..ce36968f7ca2 100644
--- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py
+++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
@@ -719,7 +719,7 @@ def forward(
         attention_mask: torch.Tensor | None = None,
         inputs_embeds: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> MoEModelOutput:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

@@ -808,7 +808,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

@@ -916,7 +916,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | Seq2SeqMoEModelOutput:
+    ) -> Seq2SeqMoEModelOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids,
@@ -1088,7 +1088,7 @@ def forward(
         output_router_logits: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | Seq2SeqMoEOutput:
+    ) -> Seq2SeqMoEOutput:
         output_router_logits = (
             output_router_logits if output_router_logits is not None else self.config.output_router_logits
         )
diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
index 771ea6b0efe8..b25f63c6f123 100644
--- a/src/transformers/models/opt/modeling_opt.py
+++ b/src/transformers/models/opt/modeling_opt.py
@@ -488,7 +488,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -670,7 +670,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -737,7 +737,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modeling_ovis2.py b/src/transformers/models/ovis2/modeling_ovis2.py
index 57e5bcd305cb..8e5d57ff6372 100644
--- a/src/transformers/models/ovis2/modeling_ovis2.py
+++ b/src/transformers/models/ovis2/modeling_ovis2.py
@@ -343,7 +343,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -421,7 +421,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @check_model_inputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -493,7 +493,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -559,7 +559,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -666,7 +666,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modular_ovis2.py b/src/transformers/models/ovis2/modular_ovis2.py
index aa659655fb49..995d37512c6a 100644
--- a/src/transformers/models/ovis2/modular_ovis2.py
+++ b/src/transformers/models/ovis2/modular_ovis2.py
@@ -136,7 +136,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -204,7 +204,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @check_model_inputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -265,7 +265,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -307,7 +307,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -402,7 +402,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index ded895ebb814..1c2e4fe2411f 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -981,7 +981,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1020,7 +1020,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index bbda1109fd3e..fe00c0cb0892 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -962,7 +962,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1001,7 +1001,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
index f813ac0c10ea..cf5ec6bf1d3f 100644
--- a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
@@ -958,7 +958,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
@@ -1208,7 +1208,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1276,7 +1276,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1386,7 +1386,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
index 6a320b66440d..6ca518535068 100644
--- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -1077,7 +1077,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
@@ -1131,7 +1131,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1199,7 +1199,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1283,7 +1283,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index f5998f0607b6..a9059351fc02 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -264,7 +264,7 @@ def set_input_embeddings(self, value):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_features = self.multi_modal_projector(selected_image_feature)
@@ -315,7 +315,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | PaligemmaModelOutputWithPast:
+    ) -> PaligemmaModelOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -467,7 +467,7 @@ def forward(
         return_dict: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaliGemmaCausalLMOutputWithPast:
+    ) -> PaliGemmaCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/pe_audio/modeling_pe_audio.py b/src/transformers/models/pe_audio/modeling_pe_audio.py
index 948cd6e1fd16..0daf26ea1649 100644
--- a/src/transformers/models/pe_audio/modeling_pe_audio.py
+++ b/src/transformers/models/pe_audio/modeling_pe_audio.py
@@ -642,7 +642,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

diff --git a/src/transformers/models/pe_audio/modular_pe_audio.py b/src/transformers/models/pe_audio/modular_pe_audio.py
index 84a1ad80df85..b5e594393344 100644
--- a/src/transformers/models/pe_audio/modular_pe_audio.py
+++ b/src/transformers/models/pe_audio/modular_pe_audio.py
@@ -117,7 +117,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

diff --git a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
index 0fb693d67941..bf47a4ea519d 100644
--- a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
@@ -589,7 +589,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
index 78bd0a044259..7127de028c6a 100644
--- a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
@@ -378,7 +378,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_video/modeling_pe_video.py b/src/transformers/models/pe_video/modeling_pe_video.py
index a94e53b77dc4..7ab2e524165f 100644
--- a/src/transformers/models/pe_video/modeling_pe_video.py
+++ b/src/transformers/models/pe_video/modeling_pe_video.py
@@ -526,7 +526,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

@@ -577,7 +577,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -594,7 +594,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/pe_video/modular_pe_video.py b/src/transformers/models/pe_video/modular_pe_video.py
index 38576598cf44..b7bc4e1d3de1 100644
--- a/src/transformers/models/pe_video/modular_pe_video.py
+++ b/src/transformers/models/pe_video/modular_pe_video.py
@@ -107,7 +107,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

@@ -158,7 +158,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -175,7 +175,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index 7a202554a6eb..1e90f4f36328 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -187,7 +187,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -255,7 +255,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -349,7 +349,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py
index ccd335a84ae6..97842903d36c 100644
--- a/src/transformers/models/perception_lm/modular_perception_lm.py
+++ b/src/transformers/models/perception_lm/modular_perception_lm.py
@@ -155,7 +155,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -223,7 +223,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -336,7 +336,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
index 38995c5f167d..5e4aa50bb47d 100644
--- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
@@ -447,7 +447,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1568,7 +1568,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index 2043cd0077e9..ccd202c62785 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -688,7 +688,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1505,7 +1505,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
index 200871240817..315a8345b3cd 100644
--- a/src/transformers/models/pixtral/modeling_pixtral.py
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -487,7 +487,7 @@ def forward(
         return_dict: bool | None = None,
         *args,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         if image_sizes is None:
             batch_size, _, height, width = pixel_values.shape
             image_sizes = [(height, width)] * batch_size
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index 24633e9ae0bc..ec9123b73bd7 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -844,7 +844,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch

     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1254,7 +1256,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1806,7 +1808,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1823,7 +1825,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1841,7 +1843,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 6ec82e156cda..f251a92dea67 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -1725,7 +1725,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch

     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1964,7 +1966,7 @@ def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) ->
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -2112,7 +2114,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -2129,7 +2131,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -2147,7 +2149,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 29f66d6cd204..80ccd7f47309 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -422,7 +422,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1185,7 +1185,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1207,7 +1207,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1468,7 +1468,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 6e266e062f16..87fa72915459 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -285,7 +285,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -676,7 +676,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 7eb1829d17c4..508ababbdf3d 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1119,7 +1119,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1141,7 +1141,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1364,7 +1364,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2VLCausalLMOutputWithPast:
+    ) -> Qwen2VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
index cfcfcec4e2c7..d0bee4c64598 100644
--- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
@@ -759,7 +759,7 @@ def forward(
         feature_lens=None,
         aftercnn_lens=None,
         **kwargs,
-    ):
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1241,7 +1241,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1758,7 +1758,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1989,7 +1989,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -2006,7 +2006,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -2024,7 +2024,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
@@ -2123,7 +2123,7 @@ def forward(
         cache_position=None,
         video_second_per_grid=None,
         **kwargs,
-    ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast:
+    ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -3015,7 +3015,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
index 9bb913b232b3..ff42a2357c00 100644
--- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
@@ -1460,7 +1460,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
@@ -1509,7 +1509,7 @@ def forward(
         cache_position=None,
         video_second_per_grid=None,
         **kwargs,
-    ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast:
+    ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast:
         output_router_logits = (
             output_router_logits if output_router_logits is not None else self.config.text_config.output_router_logits
         )
diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index a06ab3e2ca7c..ae4fb4d2647e 100644
--- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -757,7 +757,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -855,7 +855,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1095,7 +1095,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1112,7 +1112,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1186,7 +1186,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLModelOutputWithPast:
+    ) -> Qwen3VLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1387,7 +1387,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLCausalLMOutputWithPast:
+    ) -> Qwen3VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 8c38b4391c57..6c72f4117407 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -655,7 +655,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -751,7 +751,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -969,7 +969,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -994,7 +994,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1019,7 +1019,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLModelOutputWithPast:
+    ) -> Qwen3VLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1155,7 +1155,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLCausalLMOutputWithPast:
+    ) -> Qwen3VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
index d697286d0eca..84a5ab8cd47c 100644
--- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -747,7 +747,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -934,7 +934,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1228,7 +1228,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1245,7 +1245,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1319,7 +1319,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLMoeModelOutputWithPast:
+    ) -> Qwen3VLMoeModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1573,7 +1573,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast:
+    ) -> Qwen3VLMoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
index 946e972d0fd9..c7a7e5e32617 100644
--- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
@@ -362,7 +362,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> Qwen3VLMoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index f05ef121efc8..04c735cf1510 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -606,7 +606,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -751,7 +751,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -861,7 +861,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -957,7 +957,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1041,7 +1041,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1142,7 +1142,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1232,7 +1232,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta/modular_roberta.py b/src/transformers/models/roberta/modular_roberta.py
index ef1641fbccd7..84761658b9b0 100644
--- a/src/transformers/models/roberta/modular_roberta.py
+++ b/src/transformers/models/roberta/modular_roberta.py
@@ -225,7 +225,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -335,7 +335,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -431,7 +431,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -515,7 +515,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -616,7 +616,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -706,7 +706,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
index eaa1a4561f6c..ccba796c6cab 100644
--- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
+++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
@@ -619,7 +619,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -781,7 +781,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -897,7 +897,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -995,7 +995,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1080,7 +1080,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1182,7 +1182,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1274,7 +1274,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py
index a43e603a23aa..58b46f2aa51c 100644
--- a/src/transformers/models/roc_bert/modeling_roc_bert.py
+++ b/src/transformers/models/roc_bert/modeling_roc_bert.py
@@ -692,7 +692,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -858,7 +858,7 @@ def forward(
         labels_attention_mask: torch.Tensor | None = None,
         labels_token_type_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the shape vocabulary.
@@ -1054,7 +1054,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1213,7 +1213,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1350,7 +1350,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1448,7 +1448,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1572,7 +1572,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1648,7 +1648,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py
index ef1181ba641a..18b5a86eb8c9 100644
--- a/src/transformers/models/sam/modeling_sam.py
+++ b/src/transformers/models/sam/modeling_sam.py
@@ -1056,7 +1056,7 @@ def get_input_embeddings(self):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamVisionEncoderOutput:
+    ) -> SamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/sam2/modeling_sam2.py b/src/transformers/models/sam2/modeling_sam2.py
index 7d93142aff65..e0c11cb564e9 100644
--- a/src/transformers/models/sam2/modeling_sam2.py
+++ b/src/transformers/models/sam2/modeling_sam2.py
@@ -623,7 +623,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -675,7 +675,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1569,7 +1569,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py
index 42f30cafd05a..3d0e8e530a36 100644
--- a/src/transformers/models/sam2/modular_sam2.py
+++ b/src/transformers/models/sam2/modular_sam2.py
@@ -728,7 +728,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -780,7 +780,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1246,7 +1246,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam2_video/modeling_sam2_video.py b/src/transformers/models/sam2_video/modeling_sam2_video.py
index 12a9dfc4d1be..9a626a1a4f8d 100644
--- a/src/transformers/models/sam2_video/modeling_sam2_video.py
+++ b/src/transformers/models/sam2_video/modeling_sam2_video.py
@@ -1839,7 +1839,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VideoVisionEncoderOutput:
+    ) -> Sam2VideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3/modeling_sam3.py b/src/transformers/models/sam3/modeling_sam3.py
index f068c9f61a95..f4e85e152b5e 100644
--- a/src/transformers/models/sam3/modeling_sam3.py
+++ b/src/transformers/models/sam3/modeling_sam3.py
@@ -1031,7 +1031,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3VisionEncoderOutput:
+    ) -> Sam3VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1399,7 +1399,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_sizes: list[tuple[int, int]] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETREncoderOutput:
+    ) -> Sam3DETREncoderOutput:
         """
         Forward pass for the DETR encoder.

@@ -1694,7 +1694,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_shapes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETRDecoderOutput:
+    ) -> Sam3DETRDecoderOutput:
         """
         Forward pass for the DETR decoder.

@@ -2010,7 +2010,7 @@ def forward(
         prompt_features: torch.Tensor | None = None,
         prompt_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3MaskDecoderOutput:
+    ) -> Sam3MaskDecoderOutput:
         """
         Args:
             decoder_queries: Decoder output queries [batch_size, num_queries, hidden_size]
@@ -2147,7 +2147,7 @@ def get_text_features(
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Example:

diff --git a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
index 78674ca80427..929e15108019 100644
--- a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
+++ b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
@@ -1065,7 +1065,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVisionEncoderOutput:
+    ) -> Sam3TrackerVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
index 6a07c21dbd18..80f6cc1fba4d 100644
--- a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
@@ -1862,7 +1862,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
index 1d357fe923b0..44e46de7f37c 100644
--- a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
@@ -550,7 +550,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py
index 176d875a4000..19920b563a1a 100644
--- a/src/transformers/models/sam_hq/modeling_sam_hq.py
+++ b/src/transformers/models/sam_hq/modeling_sam_hq.py
@@ -556,7 +556,7 @@ def get_input_embeddings(self):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py
index 481345905cd8..ba6486d7cf4a 100644
--- a/src/transformers/models/sam_hq/modular_sam_hq.py
+++ b/src/transformers/models/sam_hq/modular_sam_hq.py
@@ -192,7 +192,7 @@ class SamHQVisionEncoder(SamVisionEncoder, SamHQPreTrainedModel):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index 657d5e3e4554..9b3a245b7e93 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -791,7 +791,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -821,7 +821,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py
index 3182fac5ca8c..f050682b0195 100644
--- a/src/transformers/models/siglip2/modeling_siglip2.py
+++ b/src/transformers/models/siglip2/modeling_siglip2.py
@@ -849,7 +849,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -880,7 +880,7 @@ def get_image_features(
         pixel_attention_mask: torch.Tensor | None = None,
         spatial_shapes: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/siglip2/modular_siglip2.py b/src/transformers/models/siglip2/modular_siglip2.py index 9c022de838e9..0220ed131ae4 100644 --- a/src/transformers/models/siglip2/modular_siglip2.py +++ b/src/transformers/models/siglip2/modular_siglip2.py @@ -367,7 +367,7 @@ def get_image_features( pixel_attention_mask: torch.Tensor | None = None, spatial_shapes: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 745817caa8b6..23d24234cc0a 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -348,7 +348,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: batch_size = pixel_values.size(0) if patch_attention_mask is None: patch_size = self.patch_size @@ -533,7 +533,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -608,7 +608,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | SmolVLMBaseModelOutputWithPast: + ) -> SmolVLMBaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -778,7 +778,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SmolVLMCausalLMOutputWithPast: + ) -> SmolVLMCausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py index 2298499b670c..e0c7a8b297c8 100644 --- a/src/transformers/models/smolvlm/modular_smolvlm.py +++ b/src/transformers/models/smolvlm/modular_smolvlm.py @@ -201,7 +201,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -276,7 +276,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | SmolVLMBaseModelOutputWithPast: + ) -> SmolVLMBaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 8e930c0b604e..2f7d6395c5d5 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -296,7 +296,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -374,7 +374,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 8613fef45695..100da24a3c1d 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -364,7 +364,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/starcoder2/modular_starcoder2.py b/src/transformers/models/starcoder2/modular_starcoder2.py index f6fd0841b217..478593ecb60d 100644 --- a/src/transformers/models/starcoder2/modular_starcoder2.py +++ b/src/transformers/models/starcoder2/modular_starcoder2.py @@ -153,7 +153,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index cb30a42ac764..9745fd8270b1 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -691,7 +691,7 @@ def forward( use_cache=None, cache_position=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoEModelOutputWithPastAndCrossAttentions: + ) -> MoEModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -949,7 +949,7 @@ def forward( decoder_inputs_embeds: torch.Tensor | None = None, cache_position: 
torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput: + ) -> Seq2SeqMoEModelOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -1104,7 +1104,7 @@ def forward( output_router_logits: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput: + ) -> Seq2SeqMoEOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -1242,7 +1242,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | MoEModelOutput: + ) -> MoEModelOutput: use_cache = False encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/switch_transformers/modular_switch_transformers.py b/src/transformers/models/switch_transformers/modular_switch_transformers.py index 64f91cc20c39..6d0a9f31a81a 100644 --- a/src/transformers/models/switch_transformers/modular_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modular_switch_transformers.py @@ -447,7 +447,7 @@ def forward( use_cache=None, cache_position=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoEModelOutputWithPastAndCrossAttentions: + ) -> MoEModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -705,7 +705,7 @@ def forward( decoder_inputs_embeds: torch.Tensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput: + ) -> Seq2SeqMoEModelOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -795,7 +795,7 @@ def forward( output_router_logits: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput: + ) -> Seq2SeqMoEOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -933,7 +933,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | MoEModelOutput: + ) -> MoEModelOutput: use_cache = False encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index f6bc2ca10eae..0874f476e9ad 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -691,7 +691,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -790,7 +790,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None 
= None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if encoder_hidden_states is None: @@ -1031,7 +1031,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index c55d20ba7b66..bbfe9542737e 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -701,7 +701,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -800,7 +800,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if encoder_hidden_states is None: @@ -1041,7 +1041,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py index da8bdc4905bc..d6812713b61a 100644 --- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py @@ -811,7 +811,7 @@ def __init__( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: # pixel_values: (batch_size, channels, height, width) # image_features: Image feature tensor of shape (num_images, image_length, embed_dim). vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) @@ -1256,7 +1256,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. 
Selected in the range `[0, diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py index 3245a041d20b..9e7d8df54e31 100644 --- a/src/transformers/models/t5gemma2/modular_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py @@ -863,7 +863,7 @@ def __init__( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: # pixel_values: (batch_size, channels, height, width) # image_features: Image feature tensor of shape (num_images, image_length, embed_dim). vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) @@ -1295,7 +1295,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/video_llama_3/modeling_video_llama_3.py b/src/transformers/models/video_llama_3/modeling_video_llama_3.py index d19162af17a3..917487d2335e 100644 --- a/src/transformers/models/video_llama_3/modeling_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modeling_video_llama_3.py @@ -351,7 +351,7 @@ def forward( cu_seqlens: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`): The cumulative sequence lengths of each image or video feature. @@ -444,7 +444,7 @@ def forward( grid_thw: torch.Tensor, merge_sizes: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`): The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values. @@ -554,7 +554,7 @@ def get_video_features( video_grid_thw: torch.LongTensor, video_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -578,7 +578,7 @@ def get_image_features( image_grid_thw: torch.LongTensor, image_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -662,7 +662,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3ModelOutputWithPast: + ) -> VideoLlama3ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. 
@@ -829,7 +829,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3CausalLMOutputWithPast: + ) -> VideoLlama3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index caeb8483afdd..6bd2ad57568b 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -416,7 +416,7 @@ def forward( cu_seqlens: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`): The cumulative sequence lengths of each image or video feature. @@ -499,7 +499,7 @@ def forward( grid_thw: torch.Tensor, merge_sizes: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`): The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values. @@ -602,7 +602,7 @@ def get_video_features( video_grid_thw: torch.LongTensor, video_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -626,7 +626,7 @@ def get_image_features( image_grid_thw: torch.LongTensor, image_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -669,7 +669,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3ModelOutputWithPast: + ) -> VideoLlama3ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -793,7 +793,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3CausalLMOutputWithPast: + ) -> VideoLlama3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index fe86cbb2512f..053bd296e09b 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -181,7 +181,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -228,7 +228,7 @@ def get_video_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input videos. @@ -320,7 +320,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | VideoLlavaModelOutputWithPast: + ) -> VideoLlavaModelOutputWithPast: r""" pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. Pixel values can be obtained using @@ -462,7 +462,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlavaCausalLMOutputWithPast: + ) -> VideoLlavaCausalLMOutputWithPast: r""" pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. Pixel values can be obtained using diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index a5d287c94b31..199e2060ddd5 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -160,7 +160,7 @@ def get_image_features( vision_feature_layers: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. 
@@ -351,7 +351,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs, - ) -> tuple | VipLlavaCausalLMOutputWithPast: + ) -> VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select diff --git a/src/transformers/models/vipllava/modular_vipllava.py b/src/transformers/models/vipllava/modular_vipllava.py index 3b9a2dfdf284..c25d77bb8a05 100644 --- a/src/transformers/models/vipllava/modular_vipllava.py +++ b/src/transformers/models/vipllava/modular_vipllava.py @@ -81,7 +81,7 @@ def get_image_features( vision_feature_layers: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -218,7 +218,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs, - ) -> tuple | VipLlavaCausalLMOutputWithPast: + ) -> VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 77aa6feb384a..9f97373a907e 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -111,7 +111,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -143,7 +143,7 @@ def get_text_features( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index b5da48a61e4c..96c4d9786c53 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -397,7 +397,7 @@ def forward( dataset_index: torch.Tensor | None = None, output_hidden_states: bool | None = None, **kwargs, - ): + ) -> BackboneOutput: r""" dataset_index (`torch.Tensor` of shape `(batch_size,)`): Index to use in the Mixture-of-Experts (MoE) blocks of the backbone. diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py index 332e02108056..40df7132087c 100644 --- a/src/transformers/models/vjepa2/modeling_vjepa2.py +++ b/src/transformers/models/vjepa2/modeling_vjepa2.py @@ -1087,7 +1087,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | ImageClassifierOutput: + ) -> ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. 
Indices should be in `[0, ..., diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py index 594ca09efe55..40d07ae24677 100644 --- a/src/transformers/models/voxtral/modeling_voxtral.py +++ b/src/transformers/models/voxtral/modeling_voxtral.py @@ -296,7 +296,7 @@ def forward( input_features, attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -403,7 +403,7 @@ def get_decoder(self): ) def get_audio_features( self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py index 7fddbcd29648..d1990cc2f1bb 100644 --- a/src/transformers/models/voxtral/modular_voxtral.py +++ b/src/transformers/models/voxtral/modular_voxtral.py @@ -71,7 +71,7 @@ def forward( input_features, attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -169,7 +169,7 @@ def get_decoder(self): ) def get_audio_features( self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index f08c4a695e2b..109c51912612 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -582,7 +582,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1211,7 +1211,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1244,7 +1244,7 @@ def get_video_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index e8006b07e22d..96060f7afc22 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -617,7 +617,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -760,7 +760,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -869,7 +869,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -965,7 +965,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1049,7 +1049,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
@@ -1150,7 +1150,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1218,7 +1218,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py index f9404c0d7993..024e0f376a79 100644 --- a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py @@ -83,7 +83,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -177,7 +177,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -246,7 +246,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -326,7 +326,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -419,7 +419,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`: @@ -483,7 +483,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index c765000ca274..d83ccf713fc2 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -613,7 +613,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -802,7 +802,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -901,7 +901,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -963,7 +963,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1035,7 +1035,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -1126,7 +1126,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
@@ -1191,7 +1191,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.roberta( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py index df51342415e9..208db351d138 100644 --- a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py @@ -303,7 +303,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -402,7 +402,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -464,7 +464,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -536,7 +536,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -627,7 +627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
@@ -692,7 +692,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.roberta( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xlstm/modeling_xlstm.py b/src/transformers/models/xlstm/modeling_xlstm.py index 33f263f7f54b..d428e0f203ba 100644 --- a/src/transformers/models/xlstm/modeling_xlstm.py +++ b/src/transformers/models/xlstm/modeling_xlstm.py @@ -1413,7 +1413,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMOutput: + ) -> xLSTMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. @@ -1581,7 +1581,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMCausalLMOutput: + ) -> xLSTMCausalLMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index c59bbc4bb022..38e60923a586 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -719,7 +719,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -885,7 +885,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -995,7 +995,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1087,7 +1087,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. 
Default: the index @@ -1166,7 +1166,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1271,7 +1271,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1357,7 +1357,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index From abaca9c34049cb47a317066a1e5d28dcf16398ec Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:39:01 +0100 Subject: [PATCH 04/20] Fix blt by removing check_model_inputs; matching other classes This is the main functional change; the rest of the series is just typing updates. --- src/transformers/models/blt/modeling_blt.py | 5 ++--- src/transformers/models/blt/modular_blt.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index 0dea1ef44a67..cbb0a5484e68 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -742,7 +742,6 @@ def __init__(self, config: BltLocalDecoderConfig): self.post_init() - @check_model_inputs def forward( self, input_ids: torch.LongTensor | None = None, @@ -1226,7 +1225,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1403,7 +1402,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention.
This tensor contains the processed image features that diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index e27a24ee59d7..86507d750dc0 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -673,7 +673,6 @@ def __init__(self, config: BltLocalDecoderConfig): self.post_init() - @check_model_inputs def forward( self, input_ids: torch.LongTensor | None = None, @@ -950,7 +949,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1118,7 +1117,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: # Call parent forward but exclude cross_attention_states from model call outputs = self.model( input_ids=input_ids, From 3652612d58c7e65fed9ab07e4aa04352f2d81472 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:46:47 +0100 Subject: [PATCH 05/20] Use can_return_tuple on altclip Also remove `return_dict` from altclip and clap: the `can_return_tuple` decorator should take care of it fully --- src/transformers/models/altclip/modeling_altclip.py | 5 ++--- src/transformers/models/clap/modeling_clap.py | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 62ffd1cc8154..9b6d63ac8298 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -964,6 +964,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple @auto_docstring # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( @@ -975,14 +976,12 @@ def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index b799bb92c75e..37bad5056c97 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1456,14 +1456,12 @@ def forward( inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") From 9d9666a099650eeb13f64a39a70ce492c3cecba2 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:54:55 +0100 Subject: [PATCH 06/20] Add docstring for check_model_inputs / can_return_tuple --- src/transformers/utils/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index b459091afb68..62a791b6dd56 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -827,6 +827,9 @@ def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]: Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or use_return_dict=False is set in the config. + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Note: output.to_tuple() convert output to tuple skipping all `None` values. """ @@ -872,6 +875,9 @@ def check_model_inputs( Decorator to intercept specific layer outputs without using hooks. Compatible with torch.compile (Dynamo tracing). + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Args: tie_last_hidden_states (`bool`, *optional*, defaults to `True`): Whether to overwrite `out.hidden_states[-1]` with the `out.last_hidden_state`. 
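As a brief illustration of the typing convention documented in the patch above: the sketch below is not part of the patch series, and `SampleOutput`/`SampleModel` are hypothetical names chosen only for illustration, but it shows how the ParamSpec-based annotation lets a method annotated with just `X` be inferred as `tuple | X` once decorated — which is why the explicit `tuple[...] | X` unions are dropped throughout the rest of this series.

# Minimal, self-contained sketch of the ParamSpec pattern (assumes Python 3.10+).
# The decorator's runtime logic is elided; only the signature transformation matters.
from collections.abc import Callable
from functools import wraps
from typing import ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")


def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]:
    @wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
        return func(*args, **kwargs)

    return wrapper


class SampleOutput:
    def to_tuple(self) -> tuple:
        return ()


class SampleModel:
    @can_return_tuple
    def forward(self) -> SampleOutput:  # annotated with just the base type
        return SampleOutput()


# A type checker now infers SampleModel().forward() as `tuple | SampleOutput`,
# so annotating forward() itself as `tuple | SampleOutput` would be redundant.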
From 99d5deb59bf238d8a8bb7fad2e429dc36d8ea610 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 14:06:49 +0100 Subject: [PATCH 07/20] Import ParamSpec via generic as we're in Python 3.10+ --- src/transformers/utils/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 62a791b6dd56..28d7c72e1975 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -25,10 +25,9 @@ from dataclasses import dataclass, fields, is_dataclass from enum import Enum from functools import partial, wraps -from typing import Any, Optional, TypedDict, TypeVar +from typing import Any, Optional, ParamSpec, TypedDict, TypeVar import numpy as np -from typing_extensions import ParamSpec from ..utils import logging from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires From 6ad9269daa6470fb9bc24513961ccc833f1ea517 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 14:18:35 +0100 Subject: [PATCH 08/20] Add check_decorator_return_types to fix-repo & check-repo --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index d2b1fa8ccb7f..e3681a5044bd 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ check-repo: -python utils/check_config_docstrings.py -python utils/check_config_attributes.py -python utils/check_doctest_list.py + -python utils/check_decorator_return_types.py -python utils/update_metadata.py --check-only -python utils/add_dates.py --check-only -@{ \ @@ -56,6 +57,7 @@ fix-repo: style -python utils/check_pipeline_typing.py --fix_and_overwrite -python utils/check_doctest_list.py --fix_and_overwrite -python utils/check_docstrings.py --fix_and_overwrite + -python utils/check_decorator_return_types.py --fix_and_overwrite -python utils/add_dates.py From a9afbdb2bc83ede5781438e67234b4045c047a17 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:35:08 +0100 Subject: [PATCH 09/20] Remove some dead code --- utils/check_decorator_return_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 2c6cbb3d5ac0..cb96a3e01a03 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -196,7 +196,7 @@ def _is_delegating_to_super(func_node: ast.AST) -> bool: return False -def _collect_decorated_functions(tree: ast.AST, file_path: str) -> list[tuple[ast.AST, str]]: +def _collect_decorated_functions(tree: ast.AST) -> list[tuple[ast.AST, str]]: """Return (function_node, decorator_name) pairs for targeted decorators.""" functions: list[tuple[ast.AST, str]] = [] @@ -254,7 +254,7 @@ def check_decorator_return_types(overwrite: bool = False): print(f"Skipping {file_path} due to SyntaxError: {e}") continue - functions = _collect_decorated_functions(tree, file_path) + functions = _collect_decorated_functions(tree) if not functions: continue From aa97fc675a29859b43d8e856b39ffdc5715580a7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:38:13 +0100 Subject: [PATCH 10/20] Update _decorator_name to not check for our targets yet --- utils/check_decorator_return_types.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index cb96a3e01a03..e166ae6a441b 100644 --- a/utils/check_decorator_return_types.py +++ 
b/utils/check_decorator_return_types.py @@ -68,7 +68,7 @@ def _iter_python_files(root: str) -> Iterable[str]: def _decorator_name(node: ast.expr) -> str | None: - """Return the simple name of a decorator, if it matches a target. + """Return the simple name of a decorator Handles forms like: - @can_return_tuple @@ -77,19 +77,12 @@ def _decorator_name(node: ast.expr) -> str | None: - @utils.check_model_inputs(...) """ - target = node - if isinstance(target, ast.Call): - target = target.func + target = node.func if isinstance(node, ast.Call) else node if isinstance(target, ast.Name): - name = target.id + return target.id elif isinstance(target, ast.Attribute): - name = target.attr - else: - return None - - if name in TARGET_DECORATORS: - return name + return target.attr return None @@ -207,7 +200,7 @@ def _collect_decorated_functions(tree: ast.AST) -> list[tuple[ast.AST, str]]: continue for deco in node.decorator_list: name = _decorator_name(deco) - if name is not None: + if name in TARGET_DECORATORS: functions.append((node, name)) break return functions From f71a072f874806ec9c558b5ddce45ece00c949da Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:41:56 +0100 Subject: [PATCH 11/20] Use src/transformers/models path instead --- utils/check_decorator_return_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index e166ae6a441b..37a37a8f1519 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers" +PATH_TO_TRANSFORMERS = "src/transformers/models" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} @@ -255,8 +255,8 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. - if _is_delegating_to_super(func_node): - continue + # if _is_delegating_to_super(func_node): + # continue returns = func_node.returns From 63155b62a72ec998ee188ab178996293c5e5a81d Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:42:36 +0100 Subject: [PATCH 12/20] Revert "Use src/transformers/models path instead" This reverts commit f71a072f874806ec9c558b5ddce45ece00c949da. --- utils/check_decorator_return_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 37a37a8f1519..e166ae6a441b 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers/models" +PATH_TO_TRANSFORMERS = "src/transformers" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} @@ -255,8 +255,8 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. 
- # if _is_delegating_to_super(func_node): - # continue + if _is_delegating_to_super(func_node): + continue returns = func_node.returns From 864a6a7e0559fff9429c0c1be861f05fe95064f1 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:43:22 +0100 Subject: [PATCH 13/20] Use src/transformers/models path instead Without extra code; this time --- utils/check_decorator_return_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index e166ae6a441b..236b73917c69 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers" +PATH_TO_TRANSFORMERS = "src/transformers/models" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} From 456ee32a96a590486d16d8a7fd842cabd9ac5a5c Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:46:32 +0100 Subject: [PATCH 14/20] Simplify _is_none_annotation --- utils/check_decorator_return_types.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 236b73917c69..d64d958e385f 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -87,18 +87,11 @@ def _decorator_name(node: ast.expr) -> str | None: def _is_none_annotation(returns: ast.expr | None) -> bool: - if returns is None: - return True - - # -> None - if isinstance(returns, ast.Constant) and returns.value is None: - return True - - # -> None (as a name) - if isinstance(returns, ast.Name) and returns.id == "None": - return True - - return False + return ( + returns is None + or isinstance(returns, ast.Constant) and returns.value is None + or isinstance(returns, ast.Name) and returns.id == "None" + ) def _is_tuple_type(node: ast.AST) -> bool: From 3b0808e1498889d9e3f7ad8ae1624a13fa6bb992 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:47:48 +0100 Subject: [PATCH 15/20] Explain why super() is skipped --- utils/check_decorator_return_types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index d64d958e385f..4b75f2dd8058 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -248,6 +248,9 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. + # We skip these as this happens sometimes in modular files with methods that inherit their return + # type from another architecture. Then they'll have no explicit return type, but we'll test via + # the generated modeling file instead. 
if _is_delegating_to_super(func_node): continue From dbd43d60a1ddd6ba900b6c63ce99f33ad6f7d2dc Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:49:32 +0100 Subject: [PATCH 16/20] Add comments for ParamSpec/TypeVar linking to Python docs --- src/transformers/utils/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 28d7c72e1975..1857f71263f2 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -33,6 +33,9 @@ from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires +# See https://docs.python.org/3/library/typing.html#typing.ParamSpec for documentation on how ParamSpec +# should be used alongside TypeVar and ParamSpecArgs (P.args) and ParamSpecKwargs (P.kwargs) for type +# hinting decorators. P = ParamSpec("P") T = TypeVar("T") From 688be4bc9aa2485c879aaf59611d4aad4366e930 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:49:47 +0100 Subject: [PATCH 17/20] Make style --- utils/check_decorator_return_types.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 4b75f2dd8058..ca1e0af52dcd 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -89,8 +89,10 @@ def _decorator_name(node: ast.expr) -> str | None: def _is_none_annotation(returns: ast.expr | None) -> bool: return ( returns is None - or isinstance(returns, ast.Constant) and returns.value is None - or isinstance(returns, ast.Name) and returns.id == "None" + or isinstance(returns, ast.Constant) + and returns.value is None + or isinstance(returns, ast.Name) + and returns.id == "None" ) From 3fd9571c96449256980f52d29d7883b1cd1eebef Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:59:12 +0100 Subject: [PATCH 18/20] Move decorator typing comments around --- src/transformers/utils/generic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 1857f71263f2..05c4615ce4c0 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -33,9 +33,7 @@ from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires -# See https://docs.python.org/3/library/typing.html#typing.ParamSpec for documentation on how ParamSpec -# should be used alongside TypeVar and ParamSpecArgs (P.args) and ParamSpecKwargs (P.kwargs) for type -# hinting decorators. +# Used to type hint decorators that modify the signature of the decorated function P = ParamSpec("P") T = TypeVar("T") @@ -824,6 +822,8 @@ def del_attribute_from_modules(module: "torch.nn.Module", key: str): del_attribute_from_modules(submodule, key) +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]: """ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or @@ -870,6 +870,8 @@ class OutputRecorder: class_name: str | None = None +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. 
def check_model_inputs( func: Callable[P, T] | None = None, *, tie_last_hidden_states: bool = True ) -> Callable[P, tuple | T]: From bc81a600c595f4f42dc60e4f65dd1bf0e0762901 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 20 Feb 2026 10:39:26 +0100 Subject: [PATCH 19/20] Rerun utils/check_decorator_return_types.py --- src/transformers/models/align/modeling_align.py | 8 ++++---- src/transformers/models/altclip/modeling_altclip.py | 6 +++--- src/transformers/models/blt/modular_blt.py | 2 +- .../models/chinese_clip/modeling_chinese_clip.py | 2 +- src/transformers/models/clap/modeling_clap.py | 2 +- src/transformers/models/clipseg/modeling_clipseg.py | 2 +- .../conditional_detr/modeling_conditional_detr.py | 2 +- src/transformers/models/d_fine/modeling_d_fine.py | 4 ++-- .../models/deformable_detr/modeling_deformable_detr.py | 6 +++--- .../models/deformable_detr/modular_deformable_detr.py | 8 ++++---- src/transformers/models/detr/modeling_detr.py | 6 +++--- .../models/exaone_moe/modeling_exaone_moe.py | 2 +- .../models/florence2/modeling_florence2.py | 4 +--- src/transformers/models/florence2/modular_florence2.py | 4 +--- src/transformers/models/gemma3n/modeling_gemma3n.py | 2 +- src/transformers/models/gemma3n/modular_gemma3n.py | 2 +- src/transformers/models/git/modeling_git.py | 2 +- .../models/granite_speech/modeling_granite_speech.py | 4 +--- .../models/higgs_audio_v2/modeling_higgs_audio_v2.py | 2 +- .../models/higgs_audio_v2/modular_higgs_audio_v2.py | 2 +- src/transformers/models/idefics/vision.py | 2 +- src/transformers/models/kosmos2/modeling_kosmos2.py | 2 +- src/transformers/models/layoutlm/modeling_layoutlm.py | 2 +- src/transformers/models/lw_detr/modeling_lw_detr.py | 2 +- src/transformers/models/lw_detr/modular_lw_detr.py | 2 +- src/transformers/models/markuplm/modeling_markuplm.py | 2 +- src/transformers/models/mlcd/modeling_mlcd.py | 2 +- .../modeling_moonshine_streaming.py | 2 +- .../moonshine_streaming/modular_moonshine_streaming.py | 2 +- .../models/pp_doclayout_v3/modeling_pp_doclayout_v3.py | 6 +++--- .../models/pp_doclayout_v3/modular_pp_doclayout_v3.py | 6 +++--- src/transformers/models/qwen3_5/modeling_qwen3_5.py | 8 ++++---- src/transformers/models/qwen3_5/modular_qwen3_5.py | 2 +- .../models/qwen3_5_moe/modeling_qwen3_5_moe.py | 8 ++++---- .../models/qwen3_omni_moe/modeling_qwen3_omni_moe.py | 4 ++-- .../models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 2 +- .../models/qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +- src/transformers/models/rt_detr/modeling_rt_detr.py | 6 +++--- src/transformers/models/rt_detr/modular_rt_detr.py | 6 +++--- .../models/rt_detr_v2/modeling_rt_detr_v2.py | 6 +++--- src/transformers/models/splinter/modeling_splinter.py | 2 +- src/transformers/models/t5gemma2/modeling_t5gemma2.py | 2 +- src/transformers/models/t5gemma2/modular_t5gemma2.py | 2 +- .../modeling_vibevoice_acoustic_tokenizer.py | 10 +++++++--- .../modular_vibevoice_acoustic_tokenizer.py | 10 +++++++--- .../voxtral_realtime/modeling_voxtral_realtime.py | 4 ++-- .../voxtral_realtime/modular_voxtral_realtime.py | 4 ++-- src/transformers/models/x_clip/modeling_x_clip.py | 2 +- 48 files changed, 92 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 211ce7aa6d9e..962ad6de9989 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -771,7 +771,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = 
True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -897,7 +897,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1010,7 +1010,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Examples: @@ -1173,7 +1173,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AlignOutput: + ) -> AlignOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 9bb228715967..c8b1c1532bbc 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -384,7 +384,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -607,7 +607,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1053,7 +1053,7 @@ def forward( return_dict: bool | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndProjection: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Examples: diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index 9232704341c2..539c99ef5670 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -1131,7 +1131,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. 
This tensor contains the processed image features that diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index c2f60f03c14c..8c76be58c32c 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -631,7 +631,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 4f24a8f0aa99..f43091b83950 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1259,7 +1259,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index bf179063b61a..b29d033ea59e 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -489,7 +489,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2de83de19c12..6bf377619d40 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1674,7 +1674,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | ConditionalDetrSegmentationOutput: + ) -> ConditionalDetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index 1c758f8b1dcd..405d24cd8510 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -1627,7 +1627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineModelOutput: + ) -> DFineModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1948,7 +1948,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineObjectDetectionOutput: + ) -> DFineObjectDetectionOutput: r""" Example: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 3ee685a887c1..41fa7b40d950 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1035,7 +1035,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1312,7 +1312,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1571,7 +1571,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. 
diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index dfbc0783fb0a..213c5571edfd 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -696,7 +696,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -817,7 +817,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1094,7 +1094,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1338,7 +1338,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 4906b3510f44..e29a17e3794b 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1148,7 +1148,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrModelOutput: + ) -> DetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1327,7 +1327,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrObjectDetectionOutput: + ) -> DetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1488,7 +1488,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrSegmentationOutput: + ) -> DetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/exaone_moe/modeling_exaone_moe.py b/src/transformers/models/exaone_moe/modeling_exaone_moe.py index 35754ab51566..82c7a392a3ed 100644 --- a/src/transformers/models/exaone_moe/modeling_exaone_moe.py +++ b/src/transformers/models/exaone_moe/modeling_exaone_moe.py @@ -512,7 +512,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index d92873f1aadf..18f5d9a3681b 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -555,9 +555,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index c5a826571b5a..18a504e1ba67 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1406,9 +1406,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index ef4ea9602a7b..73889e0cadc6 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1447,7 +1447,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 74c006133d99..b038e2005116 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1898,7 +1898,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. 
Args: diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 15a0395a1962..5a1a973e1c2f 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -770,7 +770,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 1aeae5bf1381..d7c7f1642deb 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -306,9 +306,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): hidden_states = layer(hidden_states, attention_dists=self.attention_dists) diff --git a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py index f760bc611f80..f1a06f4221a9 100644 --- a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py @@ -675,7 +675,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py index d7bbce5248b2..646693131039 100644 --- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py @@ -557,7 +557,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. 
diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 2cc51ac34993..1352519de55a 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -356,7 +356,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index e7073019d8a3..eccb957c0c02 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -448,7 +448,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 6f13f274f7d6..76fb26f1e640 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -326,7 +326,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/lw_detr/modeling_lw_detr.py b/src/transformers/models/lw_detr/modeling_lw_detr.py index 08aeb6bcea67..44f9ce107794 100644 --- a/src/transformers/models/lw_detr/modeling_lw_detr.py +++ b/src/transformers/models/lw_detr/modeling_lw_detr.py @@ -1133,7 +1133,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/lw_detr/modular_lw_detr.py b/src/transformers/models/lw_detr/modular_lw_detr.py index d783f8adcacd..d2b0a96088e1 100644 --- a/src/transformers/models/lw_detr/modular_lw_detr.py +++ b/src/transformers/models/lw_detr/modular_lw_detr.py @@ -1109,7 +1109,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index a89f526c7f71..8f1eafa11799 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -474,7 +474,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 
1e8f526f6168..a1f90104154d 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -479,7 +479,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") diff --git a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py index a58780b73252..123af47a918b 100644 --- a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py @@ -819,7 +819,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention diff --git a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py index 2f1f6058316a..3f68f2622766 100644 --- a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py @@ -363,7 +363,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention diff --git a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py index d4c275b93eed..17e8e6f34a0b 100644 --- a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py @@ -1150,7 +1150,7 @@ def forward( norm=None, mask_feat=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> PPDocLayoutV3DecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1655,7 +1655,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput: + ) -> PPDocLayoutV3ModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1977,7 +1977,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput: + ) -> PPDocLayoutV3ForObjectDetectionOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you diff --git a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py index bd5fe3b76857..fbe05c3f1f91 100644 --- a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py @@ -978,7 +978,7 @@ def forward( norm=None, mask_feat=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> PPDocLayoutV3DecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1116,7 +1116,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput: + ) -> PPDocLayoutV3ModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1435,7 +1435,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput: + ) -> PPDocLayoutV3ForObjectDetectionOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py index 96efb5838ca4..baefd277b499 100644 --- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py @@ -1518,7 +1518,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | 
BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1535,7 +1535,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1646,7 +1646,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5ModelOutputWithPast: + ) -> Qwen3_5ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1887,7 +1887,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5CausalLMOutputWithPast: + ) -> Qwen3_5CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py index 2c3297386b91..e34cee3d41a3 100644 --- a/src/transformers/models/qwen3_5/modular_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py @@ -733,7 +733,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5ModelOutputWithPast: + ) -> Qwen3_5ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py index 8fbccbd23db1..19cc78f900c9 100644 --- a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py @@ -1643,7 +1643,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1660,7 +1660,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -1771,7 +1771,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeModelOutputWithPast: + ) -> Qwen3_5MoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -2089,7 +2089,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeCausalLMOutputWithPast: + ) -> Qwen3_5MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 89aa791e2684..9667257c49d5 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -1708,7 +1708,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. @@ -2968,7 +2968,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index dee473790fef..ea7dab69095e 100644 --- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -933,7 +933,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index b64dcb69a827..40fe39991171 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -460,7 +460,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast: + ) -> Qwen3VLMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
            Indices should either be in `[0, ...,
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index 182d4b2c054a..abd8f0bf463d 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -1165,7 +1165,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1490,7 +1490,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1715,7 +1715,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py
index f9289f9e6619..7d9cffa1b6b2 100644
--- a/src/transformers/models/rt_detr/modular_rt_detr.py
+++ b/src/transformers/models/rt_detr/modular_rt_detr.py
@@ -1274,7 +1274,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1476,7 +1476,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1701,7 +1701,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
index b5244ffda7f8..13fd3c87dbf1 100644
--- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
+++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
@@ -588,7 +588,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrV2DecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1413,7 +1413,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ModelOutput:
+    ) -> RTDetrV2ModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1747,7 +1747,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ObjectDetectionOutput:
+    ) -> RTDetrV2ObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index 3825a8bfb99e..b8908e9035c7 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -297,7 +297,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
index 54a6bef97262..3c255decd994 100644
--- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
@@ -1256,7 +1256,7 @@ def get_decoder(self):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py
index 1523a164a598..68b84b524288 100644
--- a/src/transformers/models/t5gemma2/modular_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py
@@ -1274,7 +1274,7 @@ def get_decoder(self):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
index a10fffa4c029..a957736be547 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
@@ -529,7 +529,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -569,7 +571,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -604,7 +606,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
index 31ee1a2919bd..a7bab1326fb0 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
@@ -417,7 +417,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -457,7 +459,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -492,7 +494,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
index 6212b61bd2a7..e5b9d2228df0 100644
--- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
@@ -544,7 +544,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -1000,7 +1000,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
index 8dc007c8daaa..3c5f2a867911 100644
--- a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
@@ -374,7 +374,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -610,7 +610,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py
index ca545f97bd6a..110bd10c5d63 100644
--- a/src/transformers/models/x_clip/modeling_x_clip.py
+++ b/src/transformers/models/x_clip/modeling_x_clip.py
@@ -565,7 +565,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):

From b05bc5d01e20e2fcf2b164888b611782eef1389b Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 20 Feb 2026 11:02:26 +0100
Subject: [PATCH 20/20] Update incorrect typings on modular classes that inherit decorators

---
 src/transformers/models/mlcd/modular_mlcd.py                 | 2 +-
 src/transformers/models/qwen3_5/modular_qwen3_5.py           | 4 ++--
 src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py
index 443acec5a513..4e2442b4e024 100644
--- a/src/transformers/models/mlcd/modular_mlcd.py
+++ b/src/transformers/models/mlcd/modular_mlcd.py
@@ -399,7 +399,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py
index e34cee3d41a3..2f80eae3b8b7 100644
--- a/src/transformers/models/qwen3_5/modular_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py
@@ -697,7 +697,7 @@ class Qwen3_5Model(Qwen3VLModel):
     def get_video_features(
         self,
         **super_kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         # Same implementation as for images
         return super().get_video_features(**super_kwargs)

@@ -706,7 +706,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         pixel_values = pixel_values.type(self.visual.dtype)
         vision_output: BaseModelOutputWithPooling = self.visual(
             pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs
diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
index 40fe39991171..24c86fdbe27c 100644
--- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
@@ -354,7 +354,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.