From bffa96344e81a7338be53c716476d80538dc8786 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 23 Jan 2026 13:36:44 +0100
Subject: [PATCH 01/20] Automatically update can_return_tuple/check_model_inputs
 wrapped return type

This updates the typing of these two functions, so that a wrapped function
with return type X is automatically typed as `tuple | X`.
---
 src/transformers/utils/generic.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index e530b5f9acc8..b459091afb68 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -25,14 +25,19 @@
 from dataclasses import dataclass, fields, is_dataclass
 from enum import Enum
 from functools import partial, wraps
-from typing import Any, Optional, TypedDict
+from typing import Any, Optional, TypedDict, TypeVar
 
 import numpy as np
+from typing_extensions import ParamSpec
 
 from ..utils import logging
 from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires
 
 
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
 _CAN_RECORD_REGISTRY = {}
 
 
@@ -817,7 +822,7 @@ def del_attribute_from_modules(module: "torch.nn.Module", key: str):
         del_attribute_from_modules(submodule, key)
 
 
-def can_return_tuple(func):
+def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]:
     """
     Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or
     use_return_dict=False is set in the config.
@@ -827,12 +832,13 @@ def can_return_tuple(func):
     """
 
     @wraps(func)
-    def wrapper(self, *args, **kwargs):
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
+        self = args[0]
         return_dict = self.config.return_dict if hasattr(self, "config") else True
         return_dict_passed = kwargs.pop("return_dict", return_dict)
         if return_dict_passed is not None:
             return_dict = return_dict_passed
-        output = func(self, *args, **kwargs)
+        output = func(*args, **kwargs)
         if not return_dict and not isinstance(output, tuple):
             output = output.to_tuple()
         return output
@@ -859,7 +865,9 @@ class OutputRecorder:
     class_name: str | None = None
 
 
-def check_model_inputs(func=None, *, tie_last_hidden_states=True):
+def check_model_inputs(
+    func: Callable[P, T] | None = None, *, tie_last_hidden_states: bool = True
+) -> Callable[P, tuple | T]:
     """
     Decorator to intercept specific layer outputs without using hooks. Compatible with
     torch.compile (Dynamo tracing).
@@ -872,9 +880,10 @@ def check_model_inputs(func=None, *, tie_last_hidden_states=True):
         is needed for some vision models (e.g. CLIP, SigLIP)
     """
 
-    def wrapped_fn(func):
+    def wrapped_fn(func: Callable[P, T]) -> Callable[P, tuple | T]:
         @wraps(func)
-        def wrapper(self, *args, **kwargs):
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
+            self, *args = args
             args_with_config_defaults = [
                 "use_cache",
                 "vision_feature_layer",

From 676206964999bea2a85cb792c13a2bbffb5bcfc7 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 23 Jan 2026 13:37:32 +0100
Subject: [PATCH 02/20] Add check_decorator_return_types.py script to check
 against 'tuple | X'

It verifies that users no longer use return annotations such as
`tuple | BaseModelOutputWithPooling`; they should use
`BaseModelOutputWithPooling` instead.
It also makes sure that a return annotation is present at all.
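To illustrate the convention both patches rely on, here is a minimal,
self-contained sketch. `DemoOutput` and `DemoModel` are made-up names for this
example; only `can_return_tuple` comes from the patch above:

```python
from types import SimpleNamespace

from transformers.utils.generic import can_return_tuple


class DemoOutput:
    """Toy stand-in for a `ModelOutput` subclass."""

    def __init__(self, logits):
        self.logits = logits

    def to_tuple(self):
        return (self.logits,)


class DemoModel:
    # `can_return_tuple` reads `self.config.return_dict` when present.
    config = SimpleNamespace(return_dict=True)

    @can_return_tuple
    def forward(self, x: int) -> DemoOutput:  # annotate only the base type
        return DemoOutput(logits=x)


model = DemoModel()
# Statically, `forward` is now seen as returning `tuple | DemoOutput`;
# at runtime, `return_dict=False` triggers the `to_tuple()` conversion.
print(type(model.forward(1)).__name__)                     # DemoOutput
print(type(model.forward(1, return_dict=False)).__name__)  # tuple
```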
---
 .circleci/config.yml                          |   1 +
 .github/workflows/pr-repo-consistency-bot.yml |   2 +
 utils/check_decorator_return_types.py         | 351 ++++++++++++++++++
 3 files changed, 354 insertions(+)
 create mode 100644 utils/check_decorator_return_types.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7875cdc368f5..c21aab20bd8a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -189,6 +189,7 @@ jobs:
       - run: python utils/check_config_docstrings.py
       - run: python utils/check_config_attributes.py
       - run: python utils/check_doctest_list.py
+      - run: python utils/check_decorator_return_types.py
       - run: python utils/update_metadata.py --check-only
       - run: python utils/add_dates.py --check-only
       - run: >
diff --git a/.github/workflows/pr-repo-consistency-bot.yml b/.github/workflows/pr-repo-consistency-bot.yml
index 29046f8281c9..7ed47499377e 100644
--- a/.github/workflows/pr-repo-consistency-bot.yml
+++ b/.github/workflows/pr-repo-consistency-bot.yml
@@ -165,6 +165,7 @@ jobs:
           cp utils/check_pipeline_typing.py pr-repo/utils/check_pipeline_typing.py
           cp utils/check_doctest_list.py pr-repo/utils/check_doctest_list.py
           cp utils/check_docstrings.py pr-repo/utils/check_docstrings.py
+          cp utils/check_decorator_return_types.py pr-repo/utils/check_decorator_return_types.py
           cp utils/add_dates.py pr-repo/utils/add_dates.py
 
           # Run commands in PR directory (with the copied trusted scripts)
@@ -185,6 +186,7 @@ jobs:
           python utils/check_pipeline_typing.py --fix_and_overwrite
           python utils/check_doctest_list.py --fix_and_overwrite
           python utils/check_docstrings.py --fix_and_overwrite
+          python utils/check_decorator_return_types.py --fix_and_overwrite
           python utils/add_dates.py
 
           # Check if there are changes
diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py
new file mode 100644
index 000000000000..2c6cbb3d5ac0
--- /dev/null
+++ b/utils/check_decorator_return_types.py
@@ -0,0 +1,351 @@
+# Copyright 2026 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""AST-based checks for decorators that modify return types.
+
+This script ensures that functions decorated with `can_return_tuple` or
+`check_model_inputs`:
+
+1. Have an explicit, non-`None` return annotation.
+2. Are not annotated with a union that already includes `tuple`.
+
+The intention is that the decorators themselves are responsible for
+adding the `tuple` part of the return type, so the underlying function
+should be annotated with just the base return type.
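+
+For example, `-> tuple | BaseModelOutputWithPooling` is flagged, and the
+auto-fix rewrites it to `-> BaseModelOutputWithPooling`; the decorator then
+widens the signature back to `tuple | BaseModelOutputWithPooling` for callers.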
+ +Usage (from the root of the repo): + +```bash +python utils/check_decorator_return_types.py +``` +""" + +from __future__ import annotations + +import argparse +import ast +import os +from collections.abc import Iterable +from dataclasses import dataclass + + +PATH_TO_TRANSFORMERS = "src/transformers" + + +TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} + + +@dataclass +class Violation: + file_path: str + line: int + function_name: str + decorator_name: str + message: str + + def format(self) -> str: + return ( + f"{self.file_path}:{self.line}: function '{self.function_name}' " + f"decorated with '@{self.decorator_name}' {self.message}" + ) + + +def _iter_python_files(root: str) -> Iterable[str]: + for dirpath, _, filenames in os.walk(root): + for filename in filenames: + if filename.endswith(".py"): + yield os.path.join(dirpath, filename) + + +def _decorator_name(node: ast.expr) -> str | None: + """Return the simple name of a decorator, if it matches a target. + + Handles forms like: + - @can_return_tuple + - @utils.can_return_tuple + - @can_return_tuple(...) + - @utils.check_model_inputs(...) + """ + + target = node + if isinstance(target, ast.Call): + target = target.func + + if isinstance(target, ast.Name): + name = target.id + elif isinstance(target, ast.Attribute): + name = target.attr + else: + return None + + if name in TARGET_DECORATORS: + return name + return None + + +def _is_none_annotation(returns: ast.expr | None) -> bool: + if returns is None: + return True + + # -> None + if isinstance(returns, ast.Constant) and returns.value is None: + return True + + # -> None (as a name) + if isinstance(returns, ast.Name) and returns.id == "None": + return True + + return False + + +def _is_tuple_type(node: ast.AST) -> bool: + """Return True if the node represents a tuple type. + + We conservatively treat the following as tuple types: + - `tuple` + - `tuple[...]` + - `Tuple[...]` (from typing) + """ + + if isinstance(node, ast.Name) and node.id in {"tuple", "Tuple"}: + return True + + if isinstance(node, ast.Subscript): + value = node.value + if isinstance(value, ast.Name) and value.id in {"tuple", "Tuple"}: + return True + + return False + + +def _iter_union_members(node: ast.AST) -> Iterable[ast.AST]: + """Yield flattened members of a PEP 604-style union (X | Y | Z). + + For non-union nodes, yields the node itself once. + """ + + if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr): + yield from _iter_union_members(node.left) + yield from _iter_union_members(node.right) + else: + yield node + + +def _has_tuple_in_union(returns: ast.expr) -> bool: + members = list(_iter_union_members(returns)) + if len(members) <= 1: + # Not a union + return False + + return any(_is_tuple_type(member) for member in members) + + +def _is_delegating_to_super(func_node: ast.AST) -> bool: + """Return True if the function body starts with a super(...) delegation. + + We ignore functions whose first non-docstring statement is either: + - `return super(...` (possibly via an attribute like `super().foo(...)`), or + - `super(...` as a bare expression. + """ + + if not isinstance(func_node, (ast.FunctionDef, ast.AsyncFunctionDef)): + return False + + body = getattr(func_node, "body", []) + if not body: + return False + + # Skip an initial docstring expression if present. 
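+    # e.g. in `def forward(self): "docs"; return super().forward()`, body[0]
+    # is the docstring constant and body[1] is the delegating `return`.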
+ first_stmt_idx = 0 + if ( + isinstance(body[0], ast.Expr) + and isinstance(body[0].value, ast.Constant) + and isinstance(body[0].value.value, str) + ): + first_stmt_idx = 1 + + if first_stmt_idx >= len(body): + return False + + first_stmt = body[first_stmt_idx] + if isinstance(first_stmt, ast.Return): + target = first_stmt.value + elif isinstance(first_stmt, ast.Expr): + target = first_stmt.value + else: + return False + + if target is None: + return False + + # Look for a super(...) call anywhere in the expression tree. + for node in ast.walk(target): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "super": + return True + + return False + + +def _collect_decorated_functions(tree: ast.AST, file_path: str) -> list[tuple[ast.AST, str]]: + """Return (function_node, decorator_name) pairs for targeted decorators.""" + + functions: list[tuple[ast.AST, str]] = [] + for node in ast.walk(tree): + if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + continue + if not node.decorator_list: + continue + for deco in node.decorator_list: + name = _decorator_name(deco) + if name is not None: + functions.append((node, name)) + break + return functions + + +def _compute_line_offsets(source: str) -> list[int]: + """Return starting offset in the full string for each line (0-based).""" + + offsets = [0] + total = 0 + for line in source.splitlines(keepends=True): + total += len(line) + offsets.append(total) + return offsets + + +def _make_union_without_tuple(returns: ast.expr) -> str | None: + """Build a new union annotation string without any tuple-type members. + + Returns the new annotation expression as a string, or None if it cannot + be constructed (e.g. all members were tuple types). + """ + + members = [m for m in _iter_union_members(returns) if not _is_tuple_type(m)] + if not members: + return None + + # We rely on Python's built-in unparser (3.9+). + pieces = [ast.unparse(m) for m in members] + return " | ".join(pieces) + + +def check_decorator_return_types(overwrite: bool = False): + all_violations: list[Violation] = [] + unfixable_violations: list[Violation] = [] + + for file_path in _iter_python_files(PATH_TO_TRANSFORMERS): + with open(file_path, "r", encoding="utf-8") as f: + source = f.read() + + try: + tree = ast.parse(source, filename=file_path, type_comments=True) + except SyntaxError as e: + print(f"Skipping {file_path} due to SyntaxError: {e}") + continue + + functions = _collect_decorated_functions(tree, file_path) + if not functions: + continue + + fixes: list[tuple[int, int, str]] = [] # (start, end, new_text) + + for func_node, decorator_name in functions: + # Ignore trivial delegations like `return super(...` or `super(...`. + if _is_delegating_to_super(func_node): + continue + + returns = func_node.returns + + # 1. Must have a non-None return annotation. + if _is_none_annotation(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must have a non-None return annotation", + ) + all_violations.append(v) + unfixable_violations.append(v) + continue + + # Nothing else to do without an annotation. + if returns is None: + continue + + # 2. Annotation must not already be a union including `tuple`. 
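+            # e.g. `-> tuple | BaseModelOutputWithPooling` is a violation here;
+            # the auto-fix below rewrites it to `-> BaseModelOutputWithPooling`.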
+ if _has_tuple_in_union(returns): + v = Violation( + file_path=file_path, + line=func_node.lineno, + function_name=func_node.name, + decorator_name=decorator_name, + message="must not be annotated with a union that includes 'tuple'", + ) + all_violations.append(v) + + if not overwrite: + continue + + new_annotation = _make_union_without_tuple(returns) + if new_annotation is None: + unfixable_violations.append(v) + continue + + # Use precise offsets to replace just the annotation. + if not hasattr(returns, "lineno") or not hasattr(returns, "end_lineno"): + unfixable_violations.append(v) + continue + + line_offsets = _compute_line_offsets(source) + try: + start = line_offsets[returns.lineno - 1] + returns.col_offset + end = line_offsets[returns.end_lineno - 1] + returns.end_col_offset + except IndexError: + unfixable_violations.append(v) + continue + + fixes.append((start, end, new_annotation)) + + if overwrite and fixes: + # Apply fixes from the end of the file backwards so offsets stay valid. + fixes.sort(key=lambda x: x[0], reverse=True) + new_source = source + for start, end, text in fixes: + new_source = new_source[:start] + text + new_source[end:] + + if new_source != source: + print(f"Updating return annotations in {file_path} to drop 'tuple' from unions.") + with open(file_path, "w", encoding="utf-8", newline="\n") as f: + f.write(new_source) + + if all_violations and not overwrite: + header = "Found decorator return-type violations:\n\n" + body = "\n".join(v.format() for v in all_violations) + footer = "\n\nRun this script with --fix_and_overwrite to auto-fix some violations." + raise ValueError(header + body + footer) + + if overwrite and unfixable_violations: + header = "Found decorator return-type violations that could not be auto-fixed:\n\n" + body = "\n".join(v.format() for v in unfixable_violations) + footer = "\n\nPlease fix these annotations manually." 
+ raise ValueError(header + body + footer) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.") + args = parser.parse_args() + + check_decorator_return_types(args.fix_and_overwrite) From fa9caf7cdcf0fc3c0a421a24b04f081f06fd0dde Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:38:36 +0100 Subject: [PATCH 03/20] Run checking script & fix existing typings --- .../models/afmoe/modeling_afmoe.py | 2 +- .../models/afmoe/modular_afmoe.py | 2 +- .../models/aimv2/modeling_aimv2.py | 4 ++-- .../models/albert/modeling_albert.py | 14 ++++++------- .../models/align/modeling_align.py | 12 +++++------ .../models/altclip/modeling_altclip.py | 12 +++++------ src/transformers/models/aria/modeling_aria.py | 6 +++--- src/transformers/models/aria/modular_aria.py | 6 +++--- .../audioflamingo3/modeling_audioflamingo3.py | 4 ++-- .../audioflamingo3/modular_audioflamingo3.py | 4 ++-- .../models/aya_vision/modeling_aya_vision.py | 6 +++--- .../models/aya_vision/modular_aya_vision.py | 6 +++--- src/transformers/models/bert/modeling_bert.py | 18 ++++++++--------- .../modeling_bert_generation.py | 4 ++-- src/transformers/models/blip/modeling_blip.py | 14 ++++++------- .../models/blip_2/modeling_blip_2.py | 20 +++++++++---------- .../bridgetower/modeling_bridgetower.py | 2 +- src/transformers/models/bros/modeling_bros.py | 10 +++++----- .../models/camembert/modeling_camembert.py | 14 ++++++------- .../models/camembert/modular_camembert.py | 12 +++++------ .../models/chameleon/modeling_chameleon.py | 4 ++-- .../chinese_clip/modeling_chinese_clip.py | 14 ++++++------- src/transformers/models/clap/modeling_clap.py | 14 ++++++------- src/transformers/models/clip/modeling_clip.py | 4 ++-- .../models/clipseg/modeling_clipseg.py | 6 +++--- src/transformers/models/clvp/modeling_clvp.py | 2 +- .../cohere2_vision/modeling_cohere2_vision.py | 6 +++--- .../cohere2_vision/modular_cohere2_vision.py | 6 +++--- src/transformers/models/csm/modeling_csm.py | 6 +++--- src/transformers/models/csm/modular_csm.py | 6 +++--- .../models/data2vec/modeling_data2vec_text.py | 14 ++++++------- .../models/data2vec/modular_data2vec_text.py | 12 +++++------ .../deepseek_vl/modeling_deepseek_vl.py | 2 +- .../modeling_deepseek_vl_hybrid.py | 2 +- .../modular_deepseek_vl_hybrid.py | 2 +- src/transformers/models/dia/modeling_dia.py | 8 ++++---- src/transformers/models/dia/modular_dia.py | 8 ++++---- .../models/distilbert/modeling_distilbert.py | 12 +++++------ .../models/edgetam/modeling_edgetam.py | 4 ++-- .../models/edgetam/modular_edgetam.py | 2 +- .../edgetam_video/modeling_edgetam_video.py | 2 +- .../models/electra/modeling_electra.py | 16 +++++++-------- src/transformers/models/emu3/modeling_emu3.py | 6 +++--- src/transformers/models/emu3/modular_emu3.py | 6 +++--- .../modeling_encoder_decoder.py | 2 +- .../models/ernie/modeling_ernie.py | 18 ++++++++--------- .../models/ernie/modular_ernie.py | 18 ++++++++--------- .../modeling_ernie4_5_vl_moe.py | 10 +++++----- .../modular_ernie4_5_vl_moe.py | 10 +++++----- src/transformers/models/esm/modeling_esm.py | 10 +++++----- .../models/evolla/modeling_evolla.py | 12 ++++++----- .../models/evolla/modular_evolla.py | 10 ++++++---- .../models/exaone4/modeling_exaone4.py | 2 +- .../models/exaone4/modular_exaone4.py | 2 +- .../models/falcon_h1/modeling_falcon_h1.py | 4 ++-- .../models/falcon_h1/modular_falcon_h1.py | 4 ++-- .../models/fast_vlm/modeling_fast_vlm.py | 6 
+++--- .../models/fast_vlm/modular_fast_vlm.py | 6 +++--- .../models/flava/modeling_flava.py | 4 ++-- .../models/florence2/modeling_florence2.py | 10 ++++------ .../models/florence2/modular_florence2.py | 10 ++++------ src/transformers/models/fuyu/modeling_fuyu.py | 4 ++-- .../models/gemma3/modeling_gemma3.py | 6 +++--- .../models/gemma3/modular_gemma3.py | 6 +++--- .../models/gemma3n/modeling_gemma3n.py | 6 +++--- .../models/gemma3n/modular_gemma3n.py | 6 +++--- src/transformers/models/git/modeling_git.py | 2 +- src/transformers/models/glm4/modeling_glm4.py | 2 +- src/transformers/models/glm4/modular_glm4.py | 2 +- .../models/glm46v/modeling_glm46v.py | 8 ++++---- .../models/glm4v/modeling_glm4v.py | 12 +++++------ .../models/glm4v/modular_glm4v.py | 10 +++++----- .../models/glm4v_moe/modeling_glm4v_moe.py | 12 +++++------ .../models/glm4v_moe/modular_glm4v_moe.py | 4 ++-- .../models/glm_image/modeling_glm_image.py | 8 ++++---- .../models/glm_image/modular_glm_image.py | 6 +++--- .../models/glmasr/modeling_glmasr.py | 4 ++-- .../models/glmasr/modular_glmasr.py | 4 ++-- .../models/got_ocr2/modeling_got_ocr2.py | 8 ++++---- .../models/got_ocr2/modular_got_ocr2.py | 6 +++--- .../gpt_bigcode/modeling_gpt_bigcode.py | 2 +- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../models/gpt_neox/modular_gpt_neox.py | 2 +- .../granite_speech/modeling_granite_speech.py | 6 ++---- .../models/granitemoe/modeling_granitemoe.py | 2 +- .../models/granitemoe/modular_granitemoe.py | 2 +- .../modeling_granitemoehybrid.py | 4 ++-- .../modular_granitemoehybrid.py | 2 +- .../modeling_granitemoeshared.py | 2 +- .../models/groupvit/modeling_groupvit.py | 4 ++-- .../models/idefics/modeling_idefics.py | 4 ++-- src/transformers/models/idefics/vision.py | 2 +- .../models/idefics2/modeling_idefics2.py | 8 ++++---- .../models/idefics3/modeling_idefics3.py | 8 ++++---- .../instructblip/modeling_instructblip.py | 12 +++++------ .../modeling_instructblipvideo.py | 12 +++++------ .../modular_instructblipvideo.py | 6 +++--- .../models/internvl/modeling_internvl.py | 8 ++++---- .../models/internvl/modular_internvl.py | 6 +++--- .../models/janus/modeling_janus.py | 4 ++-- .../models/janus/modular_janus.py | 4 ++-- .../models/kosmos2/modeling_kosmos2.py | 12 +++++------ .../models/layoutlm/modeling_layoutlm.py | 12 +++++------ .../models/lfm2_vl/modeling_lfm2_vl.py | 6 +++--- .../models/lfm2_vl/modular_lfm2_vl.py | 6 +++--- .../lighton_ocr/modeling_lighton_ocr.py | 6 +++--- .../models/lighton_ocr/modular_lighton_ocr.py | 4 ++-- .../models/llama4/modeling_llama4.py | 8 ++++---- .../models/llava/modeling_llava.py | 6 +++--- .../models/llava_next/modeling_llava_next.py | 6 +++--- .../modeling_llava_next_video.py | 8 ++++---- .../modular_llava_next_video.py | 8 ++++---- .../modeling_llava_onevision.py | 8 ++++---- .../modular_llava_onevision.py | 8 ++++---- .../models/markuplm/modeling_markuplm.py | 10 +++++----- .../models/metaclip_2/modeling_metaclip_2.py | 4 ++-- .../models/metaclip_2/modular_metaclip_2.py | 4 ++-- .../models/minimax/modeling_minimax.py | 2 +- .../models/minimax/modular_minimax.py | 2 +- .../models/mistral3/modeling_mistral3.py | 6 +++--- .../models/mistral3/modular_mistral3.py | 6 +++--- src/transformers/models/mlcd/modeling_mlcd.py | 2 +- src/transformers/models/mlcd/modular_mlcd.py | 2 +- .../models/mllama/modeling_mllama.py | 4 ++-- .../models/mobilebert/modeling_mobilebert.py | 16 +++++++-------- .../modeling_modernbert_decoder.py | 6 +++--- .../modular_modernbert_decoder.py | 6 +++--- 
.../models/moonshine/modeling_moonshine.py | 4 ++-- .../models/moonshine/modular_moonshine.py | 4 ++-- .../models/nllb_moe/modeling_nllb_moe.py | 8 ++++---- src/transformers/models/opt/modeling_opt.py | 6 +++--- .../models/ovis2/modeling_ovis2.py | 10 +++++----- .../models/ovis2/modular_ovis2.py | 10 +++++----- .../models/owlv2/modeling_owlv2.py | 4 ++-- .../models/owlvit/modeling_owlvit.py | 4 ++-- .../paddleocr_vl/modeling_paddleocr_vl.py | 8 ++++---- .../paddleocr_vl/modular_paddleocr_vl.py | 8 ++++---- .../models/paligemma/modeling_paligemma.py | 6 +++--- .../models/pe_audio/modeling_pe_audio.py | 2 +- .../models/pe_audio/modular_pe_audio.py | 2 +- .../pe_audio_video/modeling_pe_audio_video.py | 2 +- .../pe_audio_video/modular_pe_audio_video.py | 2 +- .../models/pe_video/modeling_pe_video.py | 6 +++--- .../models/pe_video/modular_pe_video.py | 6 +++--- .../perception_lm/modeling_perception_lm.py | 6 +++--- .../perception_lm/modular_perception_lm.py | 6 +++--- .../modeling_phi4_multimodal.py | 4 ++-- .../modular_phi4_multimodal.py | 4 ++-- .../models/pixtral/modeling_pixtral.py | 2 +- .../qwen2_5_omni/modeling_qwen2_5_omni.py | 12 ++++++----- .../qwen2_5_omni/modular_qwen2_5_omni.py | 12 ++++++----- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 8 ++++---- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 4 ++-- .../models/qwen2_vl/modeling_qwen2_vl.py | 6 +++--- .../qwen3_omni_moe/modeling_qwen3_omni_moe.py | 16 +++++++-------- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 4 ++-- .../models/qwen3_vl/modeling_qwen3_vl.py | 12 +++++------ .../models/qwen3_vl/modular_qwen3_vl.py | 12 +++++------ .../qwen3_vl_moe/modeling_qwen3_vl_moe.py | 12 +++++------ .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +- .../models/roberta/modeling_roberta.py | 14 ++++++------- .../models/roberta/modular_roberta.py | 12 +++++------ .../modeling_roberta_prelayernorm.py | 14 ++++++------- .../models/roc_bert/modeling_roc_bert.py | 16 +++++++-------- src/transformers/models/sam/modeling_sam.py | 2 +- src/transformers/models/sam2/modeling_sam2.py | 6 +++--- src/transformers/models/sam2/modular_sam2.py | 6 +++--- .../models/sam2_video/modeling_sam2_video.py | 2 +- src/transformers/models/sam3/modeling_sam3.py | 10 +++++----- .../sam3_tracker/modeling_sam3_tracker.py | 2 +- .../modeling_sam3_tracker_video.py | 2 +- .../modular_sam3_tracker_video.py | 2 +- .../models/sam_hq/modeling_sam_hq.py | 2 +- .../models/sam_hq/modular_sam_hq.py | 2 +- .../models/siglip/modeling_siglip.py | 4 ++-- .../models/siglip2/modeling_siglip2.py | 4 ++-- .../models/siglip2/modular_siglip2.py | 2 +- .../models/smolvlm/modeling_smolvlm.py | 8 ++++---- .../models/smolvlm/modular_smolvlm.py | 4 ++-- .../models/splinter/modeling_splinter.py | 4 ++-- .../models/starcoder2/modeling_starcoder2.py | 2 +- .../models/starcoder2/modular_starcoder2.py | 2 +- .../modeling_switch_transformers.py | 8 ++++---- .../modular_switch_transformers.py | 8 ++++---- .../models/t5gemma/modeling_t5gemma.py | 6 +++--- .../models/t5gemma/modular_t5gemma.py | 6 +++--- .../models/t5gemma2/modeling_t5gemma2.py | 4 ++-- .../models/t5gemma2/modular_t5gemma2.py | 4 ++-- .../video_llama_3/modeling_video_llama_3.py | 12 +++++------ .../video_llama_3/modular_video_llama_3.py | 12 +++++------ .../video_llava/modeling_video_llava.py | 8 ++++---- .../models/vipllava/modeling_vipllava.py | 4 ++-- .../models/vipllava/modular_vipllava.py | 4 ++-- .../modeling_vision_text_dual_encoder.py | 4 ++-- .../modeling_vitpose_backbone.py | 2 +- .../models/vjepa2/modeling_vjepa2.py | 2 +- 
.../models/voxtral/modeling_voxtral.py | 4 ++-- .../models/voxtral/modular_voxtral.py | 4 ++-- .../models/x_clip/modeling_x_clip.py | 6 +++--- .../xlm_roberta/modeling_xlm_roberta.py | 14 ++++++------- .../models/xlm_roberta/modular_xlm_roberta.py | 12 +++++------ .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 14 ++++++------- .../xlm_roberta_xl/modular_xlm_roberta_xl.py | 12 +++++------ .../models/xlstm/modeling_xlstm.py | 4 ++-- src/transformers/models/xmod/modeling_xmod.py | 14 ++++++------- 205 files changed, 683 insertions(+), 681 deletions(-) diff --git a/src/transformers/models/afmoe/modeling_afmoe.py b/src/transformers/models/afmoe/modeling_afmoe.py index 15e88fc1f00b..1a311011d4a8 100644 --- a/src/transformers/models/afmoe/modeling_afmoe.py +++ b/src/transformers/models/afmoe/modeling_afmoe.py @@ -571,7 +571,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/afmoe/modular_afmoe.py b/src/transformers/models/afmoe/modular_afmoe.py index d81a659e905b..d3e9b5ffcce1 100644 --- a/src/transformers/models/afmoe/modular_afmoe.py +++ b/src/transformers/models/afmoe/modular_afmoe.py @@ -392,7 +392,7 @@ def forward( cache_position: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/aimv2/modeling_aimv2.py b/src/transformers/models/aimv2/modeling_aimv2.py index 1000765e53a8..b8dd43481756 100644 --- a/src/transformers/models/aimv2/modeling_aimv2.py +++ b/src/transformers/models/aimv2/modeling_aimv2.py @@ -611,7 +611,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -646,7 +646,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 70c179cfd6a3..608a77cf6fbc 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -391,7 +391,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutputWithPooling | tuple: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -465,7 +465,7 @@ def forward( labels: torch.LongTensor | None = None, sentence_order_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked 
language modeling loss. Indices should be in `[-100, 0, ..., @@ -594,7 +594,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> MaskedLMOutput | tuple: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -686,7 +686,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> SequenceClassifierOutput | tuple: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -768,7 +768,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> TokenClassifierOutput | tuple: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. @@ -825,7 +825,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, @@ -892,7 +892,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> AlbertForPreTrainingOutput | tuple: + ) -> AlbertForPreTrainingOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
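The hunks above only touch static annotations; runtime behavior is unchanged,
since `can_return_tuple` still performs the `to_tuple()` conversion. A quick
sanity sketch (assuming the standard `albert-base-v2` checkpoint is available):

```python
import torch

from transformers import AlbertModel

model = AlbertModel.from_pretrained("albert-base-v2")
input_ids = torch.tensor([[2, 45, 89, 3]])

out = model(input_ids=input_ids)  # BaseModelOutputWithPooling, as annotated
tup = model(input_ids=input_ids, return_dict=False)  # plain tuple via the decorator
print(type(out).__name__, type(tup).__name__)
```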
diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 98d14fbb307c..91e0d71b9ab7 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -772,7 +772,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -898,7 +898,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1011,7 +1011,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Examples: @@ -1103,7 +1103,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1136,7 +1136,7 @@ def get_text_features( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1172,7 +1172,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | AlignOutput: + ) -> AlignOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
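Statically, the payoff of patch 01 shows up on these cleaned-up signatures. A
hypothetical check with a static type checker (the exact rendering of the
revealed type varies by checker, and this assumes `AlignModel.forward` is
wrapped by one of the two decorators, as the new consistency script requires):

```python
# demo_typing.py -- inspect with e.g. `pyright demo_typing.py`.
# `reveal_type` is evaluated by the type checker, not at runtime.
from transformers.models.align.modeling_align import AlignModel


def inspect_forward(model: AlignModel) -> None:
    # Expected along the lines of `(...) -> tuple[Any, ...] | AlignOutput`,
    # even though the method itself is annotated with plain `AlignOutput`.
    reveal_type(model.forward)
```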
diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 13c94fa88362..62ffd1cc8154 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -385,7 +385,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -617,7 +617,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -842,7 +842,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, interpolate_pos_encoding: bool | None = False, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1069,7 +1069,7 @@ def forward( return_dict: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPoolingAndProjection: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Examples: @@ -1164,7 +1164,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1199,7 +1199,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 91aecac948b7..1b23b30f9125 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -926,7 +926,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -983,7 +983,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1099,7 +1099,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index cd84941a9c34..708187ff3891 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1271,7 +1271,7 @@ def get_image_features( vision_feature_layer: int = -1, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) image_outputs = self.vision_tower( pixel_values, @@ -1304,7 +1304,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | AriaModelOutputWithPast: + ) -> AriaModelOutputWithPast: if inputs_embeds is None: inputs_embeds = self.get_input_embeddings()(input_ids) @@ -1383,7 +1383,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AriaCausalLMOutputWithPast: + ) -> AriaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py index f88a19796f34..6794248d12a9 100644 --- a/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modeling_audioflamingo3.py @@ -327,7 +327,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -455,7 +455,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py index b846957940cc..2999993f01e6 100644 --- a/src/transformers/models/audioflamingo3/modular_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/modular_audioflamingo3.py @@ -68,7 +68,7 @@ def forward( input_features: torch.Tensor, input_features_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -156,7 +156,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 91517071cb17..b2d9454cafe1 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -190,7 +190,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -256,7 +256,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -356,7 +356,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/aya_vision/modular_aya_vision.py b/src/transformers/models/aya_vision/modular_aya_vision.py index 378a826e1f2e..e5880791638c 100644 --- a/src/transformers/models/aya_vision/modular_aya_vision.py +++ b/src/transformers/models/aya_vision/modular_aya_vision.py @@ -115,7 +115,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -157,7 +157,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionModelOutputWithPast: + ) -> AyaVisionModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -212,7 +212,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AyaVisionCausalLMOutputWithPast: + ) -> AyaVisionCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 1f3d08a5ce31..ca0f85f73bf6 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -649,7 +649,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -788,7 +788,7 @@ def forward( labels: torch.Tensor | None = None, next_sentence_label: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BertForPreTrainingOutput: + ) -> BertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -893,7 +893,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -979,7 +979,7 @@ def forward( encoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1064,7 +1064,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -1152,7 +1152,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1231,7 +1231,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1330,7 +1330,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1387,7 +1387,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.bert( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index bccb17124aa9..b3a5f344379a 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -518,7 +518,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -673,7 +673,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index e79ccdfa7d5e..919c938bdf12 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -486,7 +486,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -570,7 +570,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -602,7 +602,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -693,7 +693,7 @@ def forward( return_loss: bool | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipOutput: + ) -> BlipOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. 
@@ -809,7 +809,7 @@ def forward( interpolate_pos_encoding: bool = False, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipForConditionalGenerationModelOutput: + ) -> BlipForConditionalGenerationModelOutput: r""" Examples: @@ -980,7 +980,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" Examples: @@ -1208,7 +1208,7 @@ def forward( attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BlipTextVisionModelOutput: + ) -> BlipTextVisionModelOutput: r""" use_itm_head (`bool`, *optional*, defaults to `True`): Whether or not to use the image-text matching head. diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 0ec86489224e..cf5beb05d93b 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -507,7 +507,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -807,7 +807,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -963,7 +963,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1091,7 +1091,7 @@ def get_text_features( decoder_attention_mask: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1150,7 +1150,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: ```python @@ -1251,7 +1251,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Blip2ForConditionalGenerationModelOutput: + ) -> Blip2ForConditionalGenerationModelOutput: r""" decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. 
            Causal mask will also
@@ -1396,7 +1396,7 @@ def forward(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2TextModelOutput:
+    ) -> Blip2TextModelOutput:
         r"""
         Examples:
 
@@ -1478,7 +1478,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2VisionModelOutput:
+    ) -> Blip2VisionModelOutput:
         r"""
         Examples:
 
@@ -1622,7 +1622,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool | None = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
+    ) -> BaseModelOutputWithVisionQformerOutputs:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1696,7 +1696,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Blip2ForConditionalGenerationModelOutput:
+    ) -> Blip2ForConditionalGenerationModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py
index 15553adbf315..2c0ce5cac266 100644
--- a/src/transformers/models/bridgetower/modeling_bridgetower.py
+++ b/src/transformers/models/bridgetower/modeling_bridgetower.py
@@ -1029,7 +1029,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py
index 9b840fdcd5ab..e2cce8060118 100755
--- a/src/transformers/models/bros/modeling_bros.py
+++ b/src/transformers/models/bros/modeling_bros.py
@@ -423,7 +423,7 @@ def forward(
         output_attentions: bool | None = False,
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions:
+    ) -> BaseModelOutputWithCrossAttentions:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
         all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
@@ -570,7 +570,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -709,7 +709,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -830,7 +830,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BrosSpadeOutput:
+    ) -> BrosSpadeOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
@@ -967,7 +967,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'):
             Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values
diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index 7bdcb24be61e..528c75128033 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -628,7 +628,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -768,7 +768,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -864,7 +864,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -948,7 +948,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1049,7 +1049,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1117,7 +1117,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1218,7 +1218,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/camembert/modular_camembert.py b/src/transformers/models/camembert/modular_camembert.py
index a7d98b334983..3e903ef9652d 100644
--- a/src/transformers/models/camembert/modular_camembert.py
+++ b/src/transformers/models/camembert/modular_camembert.py
@@ -74,7 +74,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -137,7 +137,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -216,7 +216,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -308,7 +308,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -371,7 +371,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -450,7 +450,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index d4ec10c50bdc..48e8c431450e 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -893,7 +893,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1081,7 +1081,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
index e3e45c96d5d5..bba93ac10d96 100644
--- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
@@ -632,7 +632,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
 
@@ -683,7 +683,7 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -750,7 +750,7 @@ def forward(
         output_hidden_states: bool | None = None,
         interpolate_pos_encoding: bool = False,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -843,7 +843,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1010,7 +1010,7 @@ def get_text_features(
         token_type_ids: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1046,7 +1046,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1093,7 +1093,7 @@ def forward(
         interpolate_pos_encoding: bool = False,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ChineseCLIPOutput:
+    ) -> ChineseCLIPOutput:
         r"""
         return_loss (`bool`, *optional*):
             Whether or not to return the contrastive loss.
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index 64223c23e8c1..b799bb92c75e 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -1260,7 +1260,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
 
@@ -1458,7 +1458,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1561,7 +1561,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -1596,7 +1596,7 @@ def get_audio_features(
         is_longer: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1638,7 +1638,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapOutput:
+    ) -> ClapOutput:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
@@ -1752,7 +1752,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapTextModelOutput:
+    ) -> ClapTextModelOutput:
         r"""
         Examples:
 
@@ -1816,7 +1816,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs,
-    ) -> tuple | ClapAudioModelOutput:
+    ) -> ClapAudioModelOutput:
         r"""
         is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
             Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 54e02843c2de..6754fb97f9c2 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -789,7 +789,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -824,7 +824,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py
index 818ebab02f88..b788b45a276f 100644
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -499,7 +499,7 @@ def forward(
         output_attentions: bool | None = None,
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
@@ -859,7 +859,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
@@ -893,7 +893,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = True,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
 
diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py
index 1a9fcccea86d..f34a3781898e 100644
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@@ -1505,7 +1505,7 @@ def get_text_features(
         text_encoder_inputs_embeds: torch.FloatTensor | None = None,
         attention_mask: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | ClvpEncoderOutput:
+    ) -> ClvpEncoderOutput:
         r"""
         text_encoder_inputs_embeds (`torch.FloatTensor`, *optional*):
             inputs_embeds for the text encoder model passed in place of `input_ids`.
diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
index 0123d7064a41..bd53ebd60030 100644
--- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py
@@ -171,7 +171,7 @@ def set_input_embeddings(self, value):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
@@ -215,7 +215,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | Cohere2VisionModelOutputWithPast:
+    ) -> Cohere2VisionModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -295,7 +295,7 @@ def forward(
         logits_to_keep: int | torch.Tensor = 0,
         image_sizes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Cohere2VisionCausalLMOutputWithPast:
+    ) -> Cohere2VisionCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
index ffe43efe3b80..f3618cfacf4d 100644
--- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
+++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py
@@ -102,7 +102,7 @@ class Cohere2VisionModel(AyaVisionModel):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_outputs.pooler_output = self.multi_modal_projector(selected_image_feature)
@@ -122,7 +122,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | Cohere2VisionModelOutputWithPast:
+    ) -> Cohere2VisionModelOutputWithPast:
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
@@ -181,7 +181,7 @@ def forward(
         logits_to_keep: int | torch.Tensor = 0,
         image_sizes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Cohere2VisionCausalLMOutputWithPast:
+    ) -> Cohere2VisionCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py
index 77399420a9e9..50f1c7d2073d 100644
--- a/src/transformers/models/csm/modeling_csm.py
+++ b/src/transformers/models/csm/modeling_csm.py
@@ -455,7 +455,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -588,7 +588,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -961,7 +961,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CsmOutputWithPast:
+    ) -> CsmOutputWithPast:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
             1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py
index c19deb1d9f58..89f26a9e1f76 100644
--- a/src/transformers/models/csm/modular_csm.py
+++ b/src/transformers/models/csm/modular_csm.py
@@ -173,7 +173,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -325,7 +325,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
@@ -609,7 +609,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CsmOutputWithPast:
+    ) -> CsmOutputWithPast:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
             1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py
index 93177572caea..071b20fc0dc3 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_text.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -591,7 +591,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -780,7 +780,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -881,7 +881,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -946,7 +946,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -1021,7 +1021,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1121,7 +1121,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -1180,7 +1180,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         outputs = self.data2vec_text(
             input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/data2vec/modular_data2vec_text.py b/src/transformers/models/data2vec/modular_data2vec_text.py
index ac77a81841d9..eca25868b4a6 100644
--- a/src/transformers/models/data2vec/modular_data2vec_text.py
+++ b/src/transformers/models/data2vec/modular_data2vec_text.py
@@ -145,7 +145,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
@@ -246,7 +246,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -311,7 +311,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -386,7 +386,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -486,7 +486,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -545,7 +545,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         outputs = self.data2vec_text(
             input_ids,
             attention_mask=attention_mask,
diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
index 0113c1e5235c..7ef6e2f4a6e6 100644
--- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
+++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py
@@ -157,7 +157,7 @@ def set_input_embeddings(self, value):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs)
         vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
 
diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
index 15f653d7d5c0..cb4d8a7381c3 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py
@@ -299,7 +299,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         high_res_pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithHighResVisionEncodings:
+    ) -> BaseModelOutputWithHighResVisionEncodings:
         low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs)
         high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs)
         image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state)
diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
index 8488833e6aaf..d0f2e06efc12 100644
--- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
+++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py
@@ -314,7 +314,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         high_res_pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithHighResVisionEncodings:
+    ) -> BaseModelOutputWithHighResVisionEncodings:
         low_res_outputs = self.get_low_res_image_features(pixel_values, **kwargs)
         high_res_outputs = self.get_high_res_image_features(high_res_pixel_values, **kwargs)
         image_features = self.aligner(low_res_outputs.last_hidden_state, high_res_outputs.last_hidden_state)
diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py
index b2b7b34dd923..06e56c6b0e3f 100644
--- a/src/transformers/models/dia/modeling_dia.py
+++ b/src/transformers/models/dia/modeling_dia.py
@@ -467,7 +467,7 @@ def forward(
         output_attentions: bool | None = False,
         output_hidden_states: bool | None = False,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> BaseModelOutput | tuple:
+    ) -> BaseModelOutput:
         hidden_states = self.embedding(input_ids)
 
         # RoPE
@@ -600,7 +600,7 @@ def forward(
         output_hidden_states: bool | None = False,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> BaseModelOutputWithPastAndCrossAttentions | tuple:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. @@ -712,7 +712,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -849,7 +849,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index d265bea97c51..c1c905d0708c 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -257,7 +257,7 @@ def forward( output_attentions: bool | None = False, output_hidden_states: bool | None = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> BaseModelOutput | tuple: + ) -> BaseModelOutput: hidden_states = self.embedding(input_ids) # RoPE @@ -390,7 +390,7 @@ def forward( output_hidden_states: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> BaseModelOutputWithPastAndCrossAttentions | tuple: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`): The original `decoder_input_ids` in 3D shape to facilitate more efficient computations. @@ -502,7 +502,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqModelOutput: + ) -> Seq2SeqModelOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): @@ -639,7 +639,7 @@ def forward( labels: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length) or (batch_size, target_sequence_length, num_codebooks)`, *optional*): diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index bd8c75b2a855..8beb18db62ef 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -390,7 +390,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> BaseModelOutput | tuple[torch.Tensor, ...]: + ) -> BaseModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`): Indices of input sequence tokens in the vocabulary. 
@@ -481,7 +481,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> MaskedLMOutput | tuple[torch.Tensor, ...]:
+    ) -> MaskedLMOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
             Indices of input sequence tokens in the vocabulary.
@@ -575,7 +575,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> SequenceClassifierOutput | tuple[torch.Tensor, ...]:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -674,7 +674,7 @@ def forward(
         end_positions: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> QuestionAnsweringModelOutput | tuple[torch.Tensor, ...]:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`):
             Indices of input sequence tokens in the vocabulary.
@@ -773,7 +773,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> TokenClassifierOutput | tuple[torch.Tensor, ...]:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -848,7 +848,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> MultipleChoiceModelOutput | tuple[torch.Tensor, ...]:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
diff --git a/src/transformers/models/edgetam/modeling_edgetam.py b/src/transformers/models/edgetam/modeling_edgetam.py
index 34c59b50a8d4..bf5293911fdb 100644
--- a/src/transformers/models/edgetam/modeling_edgetam.py
+++ b/src/transformers/models/edgetam/modeling_edgetam.py
@@ -449,7 +449,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
 
@@ -1204,7 +1204,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py
index 7113d1a8dadc..c00b74c9c83c 100644
--- a/src/transformers/models/edgetam/modular_edgetam.py
+++ b/src/transformers/models/edgetam/modular_edgetam.py
@@ -195,7 +195,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVisionEncoderOutput:
+    ) -> EdgeTamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")
 
diff --git a/src/transformers/models/edgetam_video/modeling_edgetam_video.py b/src/transformers/models/edgetam_video/modeling_edgetam_video.py
index ef8623eb3bfe..e52d67cfa4fb 100644
--- a/src/transformers/models/edgetam_video/modeling_edgetam_video.py
+++ b/src/transformers/models/edgetam_video/modeling_edgetam_video.py
@@ -2235,7 +2235,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | EdgeTamVideoVisionEncoderOutput:
+    ) -> EdgeTamVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index 2fe4c2bea9b9..6a6b8168c692 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -594,7 +594,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithCrossAttentions:
+    ) -> BaseModelOutputWithCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -844,7 +844,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -922,7 +922,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ElectraForPreTrainingOutput:
+    ) -> ElectraForPreTrainingOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see `input_ids` docstring)
@@ -1024,7 +1024,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -1091,7 +1091,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
@@ -1150,7 +1150,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         discriminator_hidden_states = self.electra(
             input_ids,
             attention_mask=attention_mask,
@@ -1217,7 +1217,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1332,7 +1332,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py
index 2d41f4aead43..96f92f5f4ae2 100644
--- a/src/transformers/models/emu3/modeling_emu3.py
+++ b/src/transformers/models/emu3/modeling_emu3.py
@@ -1403,7 +1403,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | Emu3VQVAEModelOutput:
+    ) -> Emu3VQVAEModelOutput:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images.
@@ -1482,7 +1482,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
@@ -1563,7 +1563,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py
index ac4c79ae3cf2..7a75d4636267 100644
--- a/src/transformers/models/emu3/modular_emu3.py
+++ b/src/transformers/models/emu3/modular_emu3.py
@@ -957,7 +957,7 @@ def get_image_tokens(self, pixel_values: torch.FloatTensor, image_sizes: torch.L
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, image_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | Emu3VQVAEModelOutput:
+    ) -> Emu3VQVAEModelOutput:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
             The tensors corresponding to the input images.
@@ -1036,7 +1036,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
@@ -1117,7 +1117,7 @@ def forward(
         labels: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
             The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 2df8cf56eb05..3f6f5895ff08 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -329,7 +329,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | Seq2SeqLMOutput:
+    ) -> Seq2SeqLMOutput:
         r"""
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
             Indices of decoder input sequence tokens in the vocabulary.
diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py
index e300858722e5..2693da468327 100644
--- a/src/transformers/models/ernie/modeling_ernie.py
+++ b/src/transformers/models/ernie/modeling_ernie.py
@@ -612,7 +612,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -808,7 +808,7 @@ def forward(
         labels: torch.Tensor | None = None,
         next_sentence_label: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput:
+    ) -> ErnieForPreTrainingOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -930,7 +930,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1023,7 +1023,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1125,7 +1125,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | NextSentencePredictorOutput:
+    ) -> NextSentencePredictorOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1220,7 +1220,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1306,7 +1306,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1412,7 +1412,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -1476,7 +1476,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
diff --git a/src/transformers/models/ernie/modular_ernie.py b/src/transformers/models/ernie/modular_ernie.py
index de913662c91d..8f062cfa6c73 100644
--- a/src/transformers/models/ernie/modular_ernie.py
+++ b/src/transformers/models/ernie/modular_ernie.py
@@ -205,7 +205,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -347,7 +347,7 @@ def forward(
         labels: torch.Tensor | None = None,
         next_sentence_label: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | ErnieForPreTrainingOutput:
+    ) -> ErnieForPreTrainingOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -430,7 +430,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -500,7 +500,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -554,7 +554,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | NextSentencePredictorOutput:
+    ) -> NextSentencePredictorOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -628,7 +628,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -700,7 +700,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -791,7 +791,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
@@ -844,7 +844,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         task_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Task type embedding is a special embedding to represent the characteristic of different tasks, such as
diff --git a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
index 3b02f84c8d84..d9d2f83f963e 100644
--- a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
+++ b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py
@@ -936,7 +936,7 @@ def rot_pos_emb(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
             The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values.
@@ -1273,7 +1273,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1298,7 +1298,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1372,7 +1372,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
@@ -1663,7 +1663,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeCausalLMOutputWithPast:
+    ) -> MoeCausalLMOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
index 1093d0d9edc9..8b32a5d83877 100644
--- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
+++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py
@@ -891,7 +891,7 @@ def get_device(self):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         hidden_states = self.patch_embed(hidden_states)
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
@@ -1275,7 +1275,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs = self.vision_tower(pixel_values_videos, video_grid_thw, return_dict=True, **kwargs)
         video_embeds = self.resampler_model(video_outputs.last_hidden_state, video_grid_thw)
         split_sizes = (
@@ -1294,7 +1294,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, image_grid_thw, return_dict=True, **kwargs)
         image_embeds = self.resampler_model(image_outputs.last_hidden_state, image_grid_thw)
         split_sizes = (image_grid_thw.prod(-1) // self.vision_tower.spatial_merge_size**2).tolist()
@@ -1321,7 +1321,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
@@ -1475,7 +1475,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MoeCausalLMOutputWithPast:
+    ) -> MoeCausalLMOutputWithPast:
         r"""
         mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py
index 6a95eefa494d..f1a33fb16687 100755
--- a/src/transformers/models/esm/modeling_esm.py
+++ b/src/transformers/models/esm/modeling_esm.py
@@ -498,7 +498,7 @@ def forward(
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> BaseModelOutputWithCrossAttentions:
         for i, layer_module in enumerate(self.layer):
             hidden_states = layer_module(
                 hidden_states,
@@ -620,7 +620,7 @@ def forward(
         encoder_hidden_states: torch.Tensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
             Indices of input sequence tokens in the vocabulary.
@@ -756,7 +756,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -842,7 +842,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
@@ -915,7 +915,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 7961ea75cd3f..746b06163868 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -470,7 +470,7 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithCrossAttentions: for i, layer_module in enumerate(self.layer): hidden_states = layer_module( hidden_states, @@ -544,7 +544,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -700,7 +700,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -1319,7 +1321,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -1430,7 +1432,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
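
Call sites do not lose the tuple branch: the decorator's signature re-attaches it, so the narrowed source annotations above stay honest under static analysis. A hypothetical probe (model and inputs assumed; `reveal_type` is interpreted by the checker, not at runtime, and the rendering is approximate):

    outputs = model.forward(input_ids, attention_mask)
    reveal_type(outputs)  # pyright: "tuple[Unknown, ...] | BaseModelOutputWithPast"

    if not isinstance(outputs, tuple):
        hidden = outputs.last_hidden_state  # narrowed to the dataclass branch
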
diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index c60de607e445..23d0d1a14b12 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -228,7 +228,7 @@ def forward( input_ids: torch.Tensor | None, attention_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -384,7 +384,9 @@ def __init__(self, config: EvollaConfig): self.sequence_compressor_resampler = EvollaSequenceCompressorResampler(config=config) @can_return_tuple - def forward(self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs): + def forward( + self, input_ids: torch.LongTensor, attention_mask: torch.FloatTensor, **kwargs + ) -> EvollaProteinEncoderModelOutput: protein_output = self.model(input_ids=input_ids, attention_mask=attention_mask) protein_embeds = protein_output.last_hidden_state sequence_repr = self.sequence_compressor_resampler(protein_embeds, attention_mask) @@ -782,7 +784,7 @@ def forward( structure_batch_mask: torch.Tensor | None = None, msa_batch_mask: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. @@ -893,7 +895,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ): + ) -> CausalLMOutputWithPast: r""" protein_input_ids (torch.LongTensor): The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`. 
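
If one of these signatures regresses to a hand-written `tuple | ...` union, or loses its annotation, the new consistency check reports it, and the same script can repair the files in place:

    python utils/check_decorator_return_types.py                      # report offenders
    python utils/check_decorator_return_types.py --fix_and_overwrite  # rewrite signatures in place
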
diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index 7e87fbf5a337..3d4f26193a24 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -390,7 +390,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index abab32e8ee8b..fd3d6a1ebdc7 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -339,7 +339,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 3bf6616b5453..8a90ba266bf8 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -1289,7 +1289,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1529,7 +1529,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 12470ca974bd..442684385204 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -1015,7 +1015,7 @@ def forward( output_hidden_states: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, # NOOP kwargs, for now - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1239,7 +1239,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" Example: diff --git a/src/transformers/models/fast_vlm/modeling_fast_vlm.py b/src/transformers/models/fast_vlm/modeling_fast_vlm.py index e2e2f2bb90b7..19f921ff1f37 100644 --- a/src/transformers/models/fast_vlm/modeling_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modeling_fast_vlm.py @@ -124,7 +124,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = 
None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -183,7 +183,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -313,7 +313,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/fast_vlm/modular_fast_vlm.py b/src/transformers/models/fast_vlm/modular_fast_vlm.py index fbe891b334fd..b77e8cc5f559 100644 --- a/src/transformers/models/fast_vlm/modular_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modular_fast_vlm.py @@ -180,7 +180,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -215,7 +215,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmModelOutputWithPast: + ) -> FastVlmModelOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the @@ -288,7 +288,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | FastVlmCausalLMOutputWithPast: + ) -> FastVlmCausalLMOutputWithPast: r""" vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*): The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 87f7766e6631..2ffd53a5d8e0 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1007,7 +1007,7 @@ def get_text_features( token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. 
See @@ -1058,7 +1058,7 @@ def get_image_features( interpolate_pos_encoding: bool | None = None, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index f4fe735ce4f5..8953e2d84397 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -554,9 +554,7 @@ def __init__(self, config: Florence2VisionConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -688,7 +686,7 @@ def set_input_embeddings(self, value): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -741,7 +739,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -878,7 +876,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index f94974769f7a..6de41683260e 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1400,9 +1400,7 @@ def __init__(self, config: Florence2VisionConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: @@ -1520,7 +1518,7 @@ def get_encoder(self, modality=None): ) def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. 
@@ -1549,7 +1547,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs, - ) -> tuple | Florence2Seq2SeqModelOutput: + ) -> Florence2Seq2SeqModelOutput: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1649,7 +1647,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Florence2Seq2SeqLMOutput: + ) -> Florence2Seq2SeqLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index 63f7be1f27b7..a470a10b082d 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -118,7 +118,7 @@ def gather_continuous_embeddings( @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -269,7 +269,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | None = 0, **kwargs, - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*): Image patches to be used as continuous embeddings. The patches are flattened and then projected to the diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 9893d7a0f9a8..77ca169d8107 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -843,7 +843,7 @@ def set_input_embeddings(self, value): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -889,7 +889,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1029,7 +1029,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 31424e010f76..91498e86d9b2 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -803,7 +803,7 @@ def __init__(self, config: Gemma3Config): @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.") def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state vision_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -825,7 +825,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3ModelOutputWithPast: + ) -> Gemma3ModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -910,7 +910,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3CausalLMOutputWithPast: + ) -> Gemma3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 500e770c3df1..a41c67649eb5 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -944,7 +944,7 @@ def __init__(self, config: Gemma3nAudioConfig): @check_model_inputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -1952,7 +1952,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2163,7 +2163,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. 
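
Because the sweep is mechanical, it is enforceable by plain AST inspection. A minimal sketch of the shape of that rule follows; it is an illustration with made-up helper names (decorator_names, union_members, is_tuple, offending_functions), not the actual utils/check_decorator_return_types.py:

    import ast

    DECORATORS = {"can_return_tuple", "check_model_inputs"}


    def decorator_names(node: ast.FunctionDef) -> set[str]:
        names = set()
        for dec in node.decorator_list:
            target = dec.func if isinstance(dec, ast.Call) else dec  # handles `@check_model_inputs(...)`
            if isinstance(target, ast.Name):
                names.add(target.id)
        return names


    def union_members(node: ast.expr):
        # flatten `A | B | C` (nested `|` BinOps) into its members
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr):
            yield from union_members(node.left)
            yield from union_members(node.right)
        else:
            yield node


    def is_tuple(node: ast.expr) -> bool:
        if isinstance(node, ast.Subscript):  # also catches `tuple[torch.Tensor]`
            node = node.value
        return isinstance(node, ast.Name) and node.id == "tuple"


    def offending_functions(source: str) -> list[tuple[int, str]]:
        bad = []
        for node in ast.walk(ast.parse(source)):
            if isinstance(node, ast.FunctionDef) and decorator_names(node) & DECORATORS:
                returns = node.returns
                if returns is None or (isinstance(returns, ast.Constant) and returns.value is None):
                    bad.append((node.lineno, "missing or `None` return annotation"))
                elif any(is_tuple(member) for member in union_members(returns)):
                    bad.append((node.lineno, "return annotation already includes `tuple`"))
        return bad
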
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index fd95a8b37366..89e6fc8dd79d 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1501,7 +1501,7 @@ def __init__(self, config: Gemma3nAudioConfig): @check_model_inputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: @@ -2219,7 +2219,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_tower(pixel_values=pixel_values, do_pooling=False, return_dict=True, **kwargs) last_hidden_state = vision_outputs.last_hidden_state # Convert from (batch, channels, height, width) to (batch, height * width, channels) where: @@ -2430,7 +2430,7 @@ def get_audio_features( input_features: torch.Tensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: r""" input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`): The tensors corresponding to the input audio. diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 056442142cc0..566d6fcde849 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -777,7 +777,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index dd11e0d4dfe0..dcf61cf09648 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -481,7 +481,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4/modular_glm4.py b/src/transformers/models/glm4/modular_glm4.py index b9ec811fbc3d..f9feb89995e2 100644 --- a/src/transformers/models/glm4/modular_glm4.py +++ b/src/transformers/models/glm4/modular_glm4.py @@ -92,7 +92,7 @@ class Glm4ForCausalLM(GlmForCausalLM): def forward( self, **super_kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index de7b79923826..d7f7fca9d4a0 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -303,7 +303,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -333,7 +333,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -406,7 +406,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VModelOutputWithPast: + ) -> Glm46VModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -595,7 +595,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm46VCausalLMOutputWithPast: + ) -> Glm46VCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index fb6de9c68dc6..e2275eb86b5c 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -767,7 +767,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -858,7 +858,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1153,7 +1153,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. 
@@ -1183,7 +1183,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1256,7 +1256,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1445,7 +1445,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 0f1d57404d50..97d22db4b5e9 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -786,7 +786,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -867,7 +867,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1146,7 +1146,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1227,7 +1227,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vModelOutputWithPast: + ) -> Glm4vModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1341,7 +1341,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vCausalLMOutputWithPast: + ) -> Glm4vCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index bc84e5b83e00..d3ac4b991ffa 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -833,7 +833,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`): The final hidden states of the model. @@ -1001,7 +1001,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1322,7 +1322,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1352,7 +1352,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1425,7 +1425,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeModelOutputWithPast: + ) -> Glm4vMoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1667,7 +1667,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index dd59ae412916..75fea0c4f71e 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -415,7 +415,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -509,7 +509,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Glm4vMoeCausalLMOutputWithPast: + ) -> Glm4vMoeCausalLMOutputWithPast: outputs = self.model( input_ids=input_ids, pixel_values=pixel_values, diff --git a/src/transformers/models/glm_image/modeling_glm_image.py b/src/transformers/models/glm_image/modeling_glm_image.py index b694814e82df..33d0a6ac6965 100644 --- a/src/transformers/models/glm_image/modeling_glm_image.py +++ b/src/transformers/models/glm_image/modeling_glm_image.py @@ -720,7 +720,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. @@ -868,7 +868,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1200,7 +1200,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1260,7 +1260,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glm_image/modular_glm_image.py b/src/transformers/models/glm_image/modular_glm_image.py index d2f4604a4e72..14fff7a191c1 100644 --- a/src/transformers/models/glm_image/modular_glm_image.py +++ b/src/transformers/models/glm_image/modular_glm_image.py @@ -589,7 +589,7 @@ def rot_pos_emb(self, grid_thw): @auto_docstring def forward( self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`): Packed pixel values. 
@@ -909,7 +909,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -967,7 +967,7 @@ def forward( rope_deltas: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GlmImageModelOutputWithPast: + ) -> GlmImageModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/glmasr/modeling_glmasr.py b/src/transformers/models/glmasr/modeling_glmasr.py index 9a08c0ec1adb..7001d8848a00 100644 --- a/src/transformers/models/glmasr/modeling_glmasr.py +++ b/src/transformers/models/glmasr/modeling_glmasr.py @@ -312,7 +312,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @check_model_inputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -395,7 +395,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/glmasr/modular_glmasr.py b/src/transformers/models/glmasr/modular_glmasr.py index a81a6ed7748d..ce2bfc505219 100644 --- a/src/transformers/models/glmasr/modular_glmasr.py +++ b/src/transformers/models/glmasr/modular_glmasr.py @@ -325,7 +325,7 @@ def __init__(self, config: GlmAsrEncoderConfig): @check_model_inputs @auto_docstring - def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]): + def forward(self, input_features, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: inputs_embeds = nn.functional.gelu(self.conv1(input_features)) inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) inputs_embeds = inputs_embeds.transpose(1, 2) @@ -364,7 +364,7 @@ def get_audio_features( input_features: torch.FloatTensor, input_features_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.audio_tower(input_features, return_dict=True, **kwargs) audio_hidden_states = audio_outputs.last_hidden_state audio_hidden_states = audio_hidden_states.reshape( diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index e5f2d747cf1c..2b5de980c59d 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -437,7 +437,7 @@ def get_input_embeddings(self): @check_model_inputs(tie_last_hidden_states=False) def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | GotOcr2VisionEncoderOutput: + ) -> GotOcr2VisionEncoderOutput: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -556,7 +556,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -603,7 +603,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) -> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -701,7 +701,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index cf7a3e584549..af45aae43c88 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -308,7 +308,7 @@ def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs) last_hidden_state = image_outputs.last_hidden_state image_outputs.pooler_output = self.multi_modal_projector(last_hidden_state) @@ -329,7 +329,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2ModelOutputWithPast: + ) -> GotOcr2ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -393,7 +393,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | GotOcr2CausalLMOutputWithPast: + ) -> GotOcr2CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 4b89bcc9e140..41efb636438c 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -416,7 +416,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`): `input_ids_length` = `sequence_length` if `past_key_values` is `None` else diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 2d9e16419e81..5270f2af55f3 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -551,7 +551,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index b157af6a1bd2..9cbff5a6787f 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -424,7 +424,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 5f6bf6461141..38775edcf92e 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -297,9 +297,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): self.post_init() @check_model_inputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): hidden_states = layer(hidden_states, attention_dists=self.attention_dists) @@ -361,7 +359,7 @@ def get_output_embeddings(self): @auto_docstring def get_audio_features( self, input_features: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: audio_outputs = self.encoder(input_features, return_dict=True, **kwargs) projected_embeds = self.projector(audio_outputs.last_hidden_state) audio_outputs.pooler_output = projected_embeds diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 527b5251d3be..ee2a959f7eca 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -666,7 +666,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 6dc38232ec63..8ba86eb96e58 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -247,7 +247,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 60291ecefbe8..fcb7d1a71226 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -1287,7 +1287,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1470,7 +1470,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index f1930a4cca75..a11121d4d355 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -221,7 +221,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 3f177aa2475c..6dc665746d21 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -735,7 +735,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs, - ) -> tuple | MoeCausalLMOutputWithPast: + ) -> MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index c7b9870cd606..467b7c7ea9c4 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1236,7 +1236,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1269,7 +1269,7 @@ def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index fa23e47bf4d6..220209a8a25e 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -956,7 +956,7 @@ def forward( interpolate_pos_encoding: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsBaseModelOutputWithPast: + ) -> IdeficsBaseModelOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. @@ -1144,7 +1144,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | IdeficsCausalLMOutputWithPast: + ) -> IdeficsCausalLMOutputWithPast: r""" image_encoder_embeddings (`torch.FloatTensor`, *optional*): The output of the image encoder. diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 91bdb78c3bed..23026335fe3d 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -354,7 +354,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index e1ef275d519a..5e51cc21f140 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -469,7 +469,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" patch_attention_mask (`torch.BoolTensor` of shape `(batch_size, num_patches_height, num_patches_width)`, *optional*): The attention mask for the patches. @@ -834,7 +834,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -906,7 +906,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics2BaseModelOutputWithPast: + ) -> Idefics2BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -1035,7 +1035,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics2CausalLMOutputWithPast: + ) -> Idefics2CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index d1ac39b42115..c9b2ffed9cc8 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -476,7 +476,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: batch_size = pixel_values.size(0) if patch_attention_mask is None: patch_size = self.patch_size @@ -577,7 +577,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -650,7 +650,7 @@ def forward( cache_position: torch.LongTensor | None = None, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Idefics3BaseModelOutputWithPast: + ) -> Idefics3BaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -790,7 +790,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Idefics3CausalLMOutputWithPast: + ) -> Idefics3CausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. 
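
One more runtime detail the wrapper preserves, shown here by reusing the hypothetical ToyModel sketch after the esm diff above: an explicit `return_dict=None` defers to the config, since only a non-None override wins.

    model.config.return_dict = False
    assert isinstance(model.forward(1), tuple)                        # config default applies
    assert isinstance(model.forward(1, return_dict=None), tuple)      # None defers to config
    assert isinstance(model.forward(1, return_dict=True), ToyOutput)  # explicit override wins
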
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 78ab67d2c09a..397f4b5b24c2 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -414,7 +414,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -713,7 +713,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -872,7 +872,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1019,7 +1019,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1197,7 +1197,7 @@ def get_image_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1285,7 +1285,7 @@ def forward( labels: torch.LongTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipForConditionalGenerationModelOutput: + ) -> InstructBlipForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. 
Input tokens can optionally be provided diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 121db617af5c..7c8af79e7dc1 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -419,7 +419,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -713,7 +713,7 @@ def forward( encoder_attention_mask=None, query_length=0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] @@ -822,7 +822,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Hidden states to be used in the attention computation. If cross-attention, @@ -1008,7 +1008,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided @@ -1253,7 +1253,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. @@ -1462,7 +1462,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 8b76bfc6fba1..6e0a58f12439 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -192,7 +192,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, @@ -294,7 +294,7 @@ def get_video_features( qformer_attention_mask: torch.LongTensor | None = None, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithVisionQformerOutputs: + ) -> BaseModelOutputWithVisionQformerOutputs: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -393,7 +393,7 @@ def forward( interpolate_pos_encoding: bool = False, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput: + ) -> InstructBlipVideoForConditionalGenerationModelOutput: r""" qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)): The sequence used as a prompt to be fed to the Q-Former module. diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py index 498caae1044e..95e972c2c566 100644 --- a/src/transformers/models/internvl/modeling_internvl.py +++ b/src/transformers/models/internvl/modeling_internvl.py @@ -447,7 +447,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -554,7 +554,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. 
@@ -632,7 +632,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -796,7 +796,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLCausalLMOutputWithPast: + ) -> InternVLCausalLMOutputWithPast: r""" Example:
diff --git a/src/transformers/models/internvl/modular_internvl.py b/src/transformers/models/internvl/modular_internvl.py index d5ec73e498da..46736f29b547 100644 --- a/src/transformers/models/internvl/modular_internvl.py +++ b/src/transformers/models/internvl/modular_internvl.py @@ -401,7 +401,7 @@ def get_input_embeddings(self): @auto_docstring def forward( self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor | None = None, **kwargs - ) -> tuple | InternVLVisionModelOutputWithPooling: + ) -> InternVLVisionModelOutputWithPooling: r""" bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). @@ -494,7 +494,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, vision_feature_select_strategy: str | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -548,7 +548,7 @@ def forward( vision_feature_select_strategy: str | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | InternVLModelOutputWithPast: + ) -> InternVLModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 7ef79803672f..b87bde2a9208 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -459,7 +459,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1001,7 +1001,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index c636b69b47a4..3ea7811355ba 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -566,7 +566,7 @@ def forward( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -942,7 +942,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: vision_outputs = self.vision_model(pixel_values, return_dict=True, **kwargs) vision_outputs.pooler_output = self.aligner(vision_outputs.last_hidden_state)
diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index bcfff60f0b02..e26866c9ddf4 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -448,7 +448,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1264,7 +1264,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1339,7 +1339,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*): Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`. @@ -1514,7 +1514,7 @@ def get_image_features( pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool | None = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithProjectionAttentions: + ) -> BaseModelOutputWithProjectionAttentions: if "return_attentions" in kwargs: warnings.warn( "`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict`" @@ -1557,7 +1557,7 @@ def forward( interpolate_pos_encoding: bool = False, return_dict: bool | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | Kosmos2ModelOutput: + ) -> Kosmos2ModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0, @@ -1693,7 +1693,7 @@ def forward( output_hidden_states: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Kosmos2ForConditionalGenerationModelOutput: + ) -> Kosmos2ForConditionalGenerationModelOutput: r""" image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to indicate the location in a sequence to insert the image features .
Mask values selected in `[0, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 677f6eb75859..c27593c29825 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -326,7 +326,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -466,7 +466,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -602,7 +602,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -719,7 +719,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -854,7 +854,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, @@ -968,7 +968,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*): Bounding boxes of each input sequence tokens. Selected in the range `[0, diff --git a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py index 5ff43fadaa10..902e868eacc4 100755 --- a/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modeling_lfm2_vl.py @@ -171,7 +171,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -250,7 +250,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. 
@@ -361,7 +361,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py index 2ff5a055fff9..70abc69756a4 100644 --- a/src/transformers/models/lfm2_vl/modular_lfm2_vl.py +++ b/src/transformers/models/lfm2_vl/modular_lfm2_vl.py @@ -102,7 +102,7 @@ def get_image_features( spatial_shapes: torch.Tensor, pixel_attention_mask: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -181,7 +181,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlModelOutputWithPast: + ) -> Lfm2VlModelOutputWithPast: r""" spatial_shapes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*): The spatial shapes of the input images. @@ -271,7 +271,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Lfm2VlCausalLMOutputWithPast: + ) -> Lfm2VlCausalLMOutputWithPast: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`, *optional*): The input image tensors. diff --git a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py index c95633254ae9..03918c4c59b2 100644 --- a/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modeling_lighton_ocr.py @@ -175,7 +175,7 @@ def set_input_embeddings(self, value): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -229,7 +229,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -353,7 +353,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrCausalLMOutputWithPast: + ) -> LightOnOcrCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py index 310118ac1a4b..ea347fccb3ec 100644 --- a/src/transformers/models/lighton_ocr/modular_lighton_ocr.py +++ b/src/transformers/models/lighton_ocr/modular_lighton_ocr.py @@ -310,7 +310,7 @@ def __init__(self, config: LightOnOcrConfig): @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, image_sizes: torch.Tensor | list, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: image_outputs = self.vision_encoder(pixel_values, image_sizes=image_sizes, return_dict=True, **kwargs) image_features = image_outputs.last_hidden_state image_features = self.vision_projection(image_features.squeeze(0), image_sizes) @@ -340,7 +340,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LightOnOcrModelOutputWithPast: + ) -> LightOnOcrModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index cbc8b42e9114..c1c29bf604fa 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -535,7 +535,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -625,7 +625,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -1214,7 +1214,7 @@ def get_image_features( pixel_values: torch.FloatTensor, vision_feature_select_strategy: str, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -1267,7 +1267,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Llama4CausalLMOutputWithPast: + ) -> Llama4CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 9fbd58c786df..2431c2ab6020 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -157,7 +157,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -236,7 +236,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaModelOutputWithPast: + ) -> LlavaModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -336,7 +336,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaCausalLMOutputWithPast: + ) -> LlavaCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 311f958005b5..6be577f6fab8 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -356,7 +356,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -460,7 +460,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextModelOutputWithPast: + ) -> LlavaNextModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -603,7 +603,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextCausalLMOutputWithPast: + ) -> LlavaNextCausalLMOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. 
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 687899c8f5cd..90b7b7771e11 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -408,7 +408,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -529,7 +529,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: r""" vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. @@ -613,7 +613,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -744,7 +744,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 345798810cb3..1d5427eda671 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -282,7 +282,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_patches, channels, height, width)`) The tensors corresponding to the input images. @@ -353,7 +353,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. 
@@ -454,7 +454,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaNextVideoModelOutputWithPast: + ) -> LlavaNextVideoModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -571,7 +571,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaNextVideoCausalLMOutputWithPast: + ) -> LlavaNextVideoCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 3c7931e2e4ce..d950a8c59256 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -367,7 +367,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -493,7 +493,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -583,7 +583,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -729,7 +729,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. 
diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index 6c49d02662f2..692d76a9fc45 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -324,7 +324,7 @@ def get_image_features( batch_num_images: torch.LongTensor | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" image_sizes (`torch.Tensor` of shape `(num_images, 2)`): Actual image size of each images (H, W). @@ -397,7 +397,7 @@ def get_video_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input video. @@ -459,7 +459,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | LlavaOnevisionModelOutputWithPast: + ) -> LlavaOnevisionModelOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. @@ -565,7 +565,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | LlavaOnevisionCausalLMOutputWithPast: + ) -> LlavaOnevisionCausalLMOutputWithPast: r""" image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*): The sizes of the videos in the batch, being (height, width) for each frame in the video. diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 0a01c4b556f4..c4a0fa5e6c80 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -474,7 +474,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -558,7 +558,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -666,7 +666,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. 
@@ -782,7 +782,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth. @@ -885,7 +885,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" xpath_tags_seq (`torch.LongTensor` of shape `(batch_size, sequence_length, config.max_depth)`, *optional*): Tag IDs for each token in the input sequence, padded up to config.max_depth.
diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index 2440e5fcb58f..5571259d128e 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -799,7 +799,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -831,7 +831,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples:
diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 4fc1aa6e3d6a..d7eb26d99ab6 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -563,7 +563,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -591,7 +591,7 @@ def get_image_features( pixel_values: torch.FloatTensor | None = None, interpolate_pos_encoding: bool = False, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples:
diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index a2dff7e9401b..841c81694c8c 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -654,7 +654,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index c05e7db364f5..5e2f67bc8150 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -546,7 +546,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
diff --git a/src/transformers/models/mistral3/modeling_mistral3.py b/src/transformers/models/mistral3/modeling_mistral3.py index 03112e6fb77d..76a7ff7b9582 100644 --- a/src/transformers/models/mistral3/modeling_mistral3.py +++ b/src/transformers/models/mistral3/modeling_mistral3.py @@ -226,7 +226,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -296,7 +296,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -407,7 +407,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/mistral3/modular_mistral3.py b/src/transformers/models/mistral3/modular_mistral3.py index 7ebf180b3e3e..2f16bac7b3cf 100644 --- a/src/transformers/models/mistral3/modular_mistral3.py +++ b/src/transformers/models/mistral3/modular_mistral3.py @@ -131,7 +131,7 @@ def get_image_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: kwargs = {k: v for k, v in kwargs.items() if v is not None} # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. image_outputs = self.vision_tower( @@ -177,7 +177,7 @@ def forward( cache_position: torch.LongTensor | None = None, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3ModelOutputWithPast: + ) -> Mistral3ModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -258,7 +258,7 @@ def forward( logits_to_keep: int | torch.Tensor = 0, image_sizes: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Mistral3CausalLMOutputWithPast: + ) -> Mistral3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss.
Indices should either be in `[0, ..., diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index a3f2498a9192..e0169855ad2c 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -528,7 +528,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py index f572d2758716..040e7bab69b4 100644 --- a/src/transformers/models/mlcd/modular_mlcd.py +++ b/src/transformers/models/mlcd/modular_mlcd.py @@ -439,7 +439,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Example: diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 5716e6c3d6e5..cda5b30ce38b 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1360,7 +1360,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. This tensor contains the processed image features that @@ -1618,7 +1618,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" aspect_ratio_mask (`torch.Tensor` of shape `(batch_size, max_num_images, max_num_tiles)`, *optional*): Mask to avoid performing attention on padding tiles. Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index fe25e0b139c8..bef2d879d5f6 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -622,7 +622,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -700,7 +700,7 @@ def forward( labels: torch.LongTensor | None = None, next_sentence_label: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MobileBertForPreTrainingOutput: + ) -> MobileBertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -798,7 +798,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -867,7 +867,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | NextSentencePredictorOutput: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -954,7 +954,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1032,7 +1032,7 @@ def forward( start_positions: torch.Tensor | None = None, end_positions: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.mobilebert( input_ids, attention_mask=attention_mask, @@ -1103,7 +1103,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1203,7 +1203,7 @@ def forward( inputs_embeds: torch.Tensor | None = None, labels: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. diff --git a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index f0a6d343b99c..af6581968093 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -484,7 +484,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -589,7 +589,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -692,7 +692,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index e4005404e556..6c3eec505d84 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -535,7 +535,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor, ...] | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) == (inputs_embeds is None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -640,7 +640,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., @@ -743,7 +743,7 @@ def forward( labels: torch.LongTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutputWithPast: + ) -> SequenceClassifierOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 7d8731e0ddc0..001c0f735db8 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -556,7 +556,7 @@ def forward( input_values: torch.FloatTensor, attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): @@ -648,7 +648,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention
diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py
index 6517494ffe7f..f0f15232d3f5 100644
--- a/src/transformers/models/moonshine/modular_moonshine.py
+++ b/src/transformers/models/moonshine/modular_moonshine.py
@@ -519,7 +519,7 @@ def forward(
         input_values: torch.FloatTensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         Args:
             input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
@@ -601,7 +601,7 @@ def forward(
         encoder_hidden_states: torch.FloatTensor | None = None,
         encoder_attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
             Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
index 2cc7aba473c6..ce36968f7ca2 100644
--- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py
+++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py
@@ -719,7 +719,7 @@ def forward(
         attention_mask: torch.Tensor | None = None,
         inputs_embeds: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> MoEModelOutput:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

@@ -808,7 +808,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPastAndCrossAttentions:
+    ) -> BaseModelOutputWithPastAndCrossAttentions:
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)

@@ -916,7 +916,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | Seq2SeqMoEModelOutput:
+    ) -> Seq2SeqMoEModelOutput:
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_ids=input_ids,
@@ -1088,7 +1088,7 @@ def forward(
         output_router_logits: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | Seq2SeqMoEOutput:
+    ) -> Seq2SeqMoEOutput:
         output_router_logits = (
             output_router_logits if output_router_logits is not None else self.config.output_router_logits
         )
diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
index 771ea6b0efe8..b25f63c6f123 100644
--- a/src/transformers/models/opt/modeling_opt.py
+++ b/src/transformers/models/opt/modeling_opt.py
@@ -488,7 +488,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -670,7 +670,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -737,7 +737,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | CausalLMOutputWithPast:
+    ) -> CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modeling_ovis2.py b/src/transformers/models/ovis2/modeling_ovis2.py
index 57e5bcd305cb..8e5d57ff6372 100644
--- a/src/transformers/models/ovis2/modeling_ovis2.py
+++ b/src/transformers/models/ovis2/modeling_ovis2.py
@@ -343,7 +343,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -421,7 +421,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @check_model_inputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -493,7 +493,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -559,7 +559,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -666,7 +666,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/ovis2/modular_ovis2.py b/src/transformers/models/ovis2/modular_ovis2.py
index aa659655fb49..995d37512c6a 100644
--- a/src/transformers/models/ovis2/modular_ovis2.py
+++ b/src/transformers/models/ovis2/modular_ovis2.py
@@ -136,7 +136,7 @@ def forward(
         pixel_values,
         attention_mask: torch.Tensor | None = None,
         **kwargs,
-    ):
+    ) -> BaseModelOutput:
         hidden_states = self.embeddings(pixel_values)

         encoder_outputs: BaseModelOutput = self.encoder(
@@ -204,7 +204,7 @@ def __init__(self, config: Ovis2VisionConfig):
     @check_model_inputs
     def forward(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         outputs = self.transformer(pixel_values, **kwargs)
         last_hidden_state = outputs[0]
         if self.config.hidden_stride > 1:
@@ -265,7 +265,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithVisualIndicatorFeatures:
+    ) -> BaseModelOutputWithVisualIndicatorFeatures:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         image_features = image_outputs.pooler_output
         batch_size, img_seq_len, _ = image_features.shape
@@ -307,7 +307,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2ModelOutputWithPast:
+    ) -> Ovis2ModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -402,7 +402,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs,
-    ) -> tuple | Ovis2CausalLMOutputWithPast:
+    ) -> Ovis2CausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index ded895ebb814..1c2e4fe2411f 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -981,7 +981,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1020,7 +1020,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index bbda1109fd3e..fe00c0cb0892 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -962,7 +962,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -1001,7 +1001,7 @@ def get_image_features(
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:
         ```python
diff --git a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
index f813ac0c10ea..cf5ec6bf1d3f 100644
--- a/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modeling_paddleocr_vl.py
@@ -958,7 +958,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
@@ -1208,7 +1208,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1276,7 +1276,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1386,7 +1386,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
index 6a320b66440d..6ca518535068 100644
--- a/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
+++ b/src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -1077,7 +1077,7 @@ def forward(
         cu_seqlens: torch.Tensor,
         image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
@@ -1131,7 +1131,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1199,7 +1199,7 @@ def forward(
         rope_deltas: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | PaddleOCRVLModelOutputWithPast:
+    ) -> PaddleOCRVLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1283,7 +1283,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaddleOCRVLCausalLMOutputWithPast:
+    ) -> PaddleOCRVLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index f5998f0607b6..a9059351fc02 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -264,7 +264,7 @@ def set_input_embeddings(self, value):
     )
     def get_image_features(
         self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values, return_dict=True, **kwargs)
         selected_image_feature = image_outputs.last_hidden_state
         image_features = self.multi_modal_projector(selected_image_feature)
@@ -315,7 +315,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | PaligemmaModelOutputWithPast:
+    ) -> PaligemmaModelOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -467,7 +467,7 @@ def forward(
         return_dict: bool | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | PaliGemmaCausalLMOutputWithPast:
+    ) -> PaliGemmaCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/pe_audio/modeling_pe_audio.py b/src/transformers/models/pe_audio/modeling_pe_audio.py
index 948cd6e1fd16..0daf26ea1649 100644
--- a/src/transformers/models/pe_audio/modeling_pe_audio.py
+++ b/src/transformers/models/pe_audio/modeling_pe_audio.py
@@ -642,7 +642,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

diff --git a/src/transformers/models/pe_audio/modular_pe_audio.py b/src/transformers/models/pe_audio/modular_pe_audio.py
index 84a1ad80df85..b5e594393344 100644
--- a/src/transformers/models/pe_audio/modular_pe_audio.py
+++ b/src/transformers/models/pe_audio/modular_pe_audio.py
@@ -117,7 +117,7 @@ def forward(
         input_values: torch.Tensor,
         padding_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(input_values, padding_mask=padding_mask)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

diff --git a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
index 0fb693d67941..bf47a4ea519d 100644
--- a/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modeling_pe_audio_video.py
@@ -589,7 +589,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
index 78bd0a044259..7127de028c6a 100644
--- a/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
+++ b/src/transformers/models/pe_audio_video/modular_pe_audio_video.py
@@ -378,7 +378,7 @@ def forward(
         padding_mask: torch.Tensor | None = None,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | PeAudioVideoEncoderOutput:
+    ) -> PeAudioVideoEncoderOutput:
         inputs_embeds, padding_mask, audio_output, video_output = self.embedder(
             input_values,
             pixel_values_videos,
diff --git a/src/transformers/models/pe_video/modeling_pe_video.py b/src/transformers/models/pe_video/modeling_pe_video.py
index a94e53b77dc4..7ab2e524165f 100644
--- a/src/transformers/models/pe_video/modeling_pe_video.py
+++ b/src/transformers/models/pe_video/modeling_pe_video.py
@@ -526,7 +526,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

@@ -577,7 +577,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -594,7 +594,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/pe_video/modular_pe_video.py b/src/transformers/models/pe_video/modular_pe_video.py
index 38576598cf44..b7bc4e1d3de1 100644
--- a/src/transformers/models/pe_video/modular_pe_video.py
+++ b/src/transformers/models/pe_video/modular_pe_video.py
@@ -107,7 +107,7 @@ def forward(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
         inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)

@@ -158,7 +158,7 @@ def get_text_features(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         text_outputs: BaseModelOutputWithPooling = self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -175,7 +175,7 @@ def get_video_features(
         pixel_values_videos: torch.Tensor,
         padding_mask_videos: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         video_outputs: BaseModelOutputWithPooling = self.video_encoder(
             pixel_values_videos=pixel_values_videos,
             padding_mask_videos=padding_mask_videos,
diff --git a/src/transformers/models/perception_lm/modeling_perception_lm.py b/src/transformers/models/perception_lm/modeling_perception_lm.py
index 7a202554a6eb..1e90f4f36328 100644
--- a/src/transformers/models/perception_lm/modeling_perception_lm.py
+++ b/src/transformers/models/perception_lm/modeling_perception_lm.py
@@ -187,7 +187,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -255,7 +255,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -349,7 +349,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/perception_lm/modular_perception_lm.py b/src/transformers/models/perception_lm/modular_perception_lm.py
index ccd335a84ae6..97842903d36c 100644
--- a/src/transformers/models/perception_lm/modular_perception_lm.py
+++ b/src/transformers/models/perception_lm/modular_perception_lm.py
@@ -155,7 +155,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         image_outputs = self.vision_tower(pixel_values.flatten(0, 1), return_dict=True, **kwargs)
         last_hidden_state = image_outputs.last_hidden_state
         if self.config.vision_use_cls_token:
@@ -223,7 +223,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMModelOutputWithPast:
+    ) -> PerceptionLMModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -336,7 +336,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **lm_kwargs,
-    ) -> tuple | PerceptionLMCausalLMOutputWithPast:
+    ) -> PerceptionLMCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
index 38995c5f167d..5e4aa50bb47d 100644
--- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py
@@ -447,7 +447,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1568,7 +1568,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index 2043cd0077e9..ccd202c62785 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -688,7 +688,7 @@ def forward(
         pixel_values,
         patch_attention_mask: torch.BoolTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         batch_size = pixel_values.size(0)
         if patch_attention_mask is None:
             patch_attention_mask = torch.ones(
@@ -1505,7 +1505,7 @@ def forward(
         output_hidden_states: bool | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs,
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         image_pixel_values (`torch.FloatTensor`, *optional*):
             If the input contains images, these correspond to the pixel values after transformations (as returned by
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
index 200871240817..315a8345b3cd 100644
--- a/src/transformers/models/pixtral/modeling_pixtral.py
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -487,7 +487,7 @@ def forward(
         return_dict: bool | None = None,
         *args,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         if image_sizes is None:
             batch_size, _, height, width = pixel_values.shape
             image_sizes = [(height, width)] * batch_size
diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
index 24633e9ae0bc..ec9123b73bd7 100644
--- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py
@@ -844,7 +844,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch

     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1254,7 +1256,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1806,7 +1808,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1823,7 +1825,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1841,7 +1843,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index 6ec82e156cda..f251a92dea67 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -1725,7 +1725,9 @@ def _prepare_attention_mask(self, inputs_tensor: torch.Tensor, cu_seqlens: torch

     @check_model_inputs(tie_last_hidden_states=False)
     @auto_docstring
-    def forward(self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]):
+    def forward(
+        self, input_features, feature_lens=None, aftercnn_lens=None, **kwargs: Unpack[TransformersKwargs]
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1964,7 +1966,7 @@ def __init__(self, config: Qwen2_5OmniVisionEncoderConfig, *inputs, **kwargs) ->
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -2112,7 +2114,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -2129,7 +2131,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -2147,7 +2149,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 29f66d6cd204..80ccd7f47309 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -422,7 +422,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1185,7 +1185,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1207,7 +1207,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1468,7 +1468,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 6e266e062f16..87fa72915459 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -285,7 +285,7 @@ def get_window_index(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -676,7 +676,7 @@ def forward(
         second_per_grid_ts: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2_5_VLCausalLMOutputWithPast:
+    ) -> Qwen2_5_VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 7eb1829d17c4..508ababbdf3d 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1119,7 +1119,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1141,7 +1141,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1364,7 +1364,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen2VLCausalLMOutputWithPast:
+    ) -> Qwen2VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
index cfcfcec4e2c7..d0bee4c64598 100644
--- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py
@@ -759,7 +759,7 @@ def forward(
         feature_lens=None,
         aftercnn_lens=None,
         **kwargs,
-    ):
+    ) -> BaseModelOutputWithPooling:
         r"""
         feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
             mel length
@@ -1241,7 +1241,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1758,7 +1758,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1989,7 +1989,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -2006,7 +2006,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -2024,7 +2024,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
@@ -2123,7 +2123,7 @@ def forward(
         cache_position=None,
         video_second_per_grid=None,
         **kwargs,
-    ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast:
+    ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -3015,7 +3015,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
index 9bb913b232b3..ff42a2357c00 100644
--- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
@@ -1460,7 +1460,7 @@ def get_audio_features(
         feature_attention_mask: torch.LongTensor | None = None,
         audio_feature_lengths: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             The tensors corresponding to the input audios.
@@ -1509,7 +1509,7 @@ def forward(
         cache_position=None,
         video_second_per_grid=None,
         **kwargs,
-    ) -> tuple | Qwen3OmniMoeThinkerCausalLMOutputWithPast:
+    ) -> Qwen3OmniMoeThinkerCausalLMOutputWithPast:
         output_router_logits = (
             output_router_logits if output_router_logits is not None else self.config.text_config.output_router_logits
         )
diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index a06ab3e2ca7c..ae4fb4d2647e 100644
--- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -757,7 +757,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -855,7 +855,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1095,7 +1095,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1112,7 +1112,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1186,7 +1186,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLModelOutputWithPast:
+    ) -> Qwen3VLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1387,7 +1387,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLCausalLMOutputWithPast:
+    ) -> Qwen3VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 8c38b4391c57..6c72f4117407 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -655,7 +655,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -751,7 +751,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -969,7 +969,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -994,7 +994,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1019,7 +1019,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLModelOutputWithPast:
+    ) -> Qwen3VLModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1155,7 +1155,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLCausalLMOutputWithPast:
+    ) -> Qwen3VLCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
index d697286d0eca..84a5ab8cd47c 100644
--- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -747,7 +747,7 @@ def fast_pos_embed_interpolate(self, grid_thw):
     @check_model_inputs
     def forward(
         self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -934,7 +934,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | BaseModelOutputWithPast:
+    ) -> BaseModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.
@@ -1228,7 +1228,7 @@ def get_video_features(
         pixel_values_videos: torch.FloatTensor,
         video_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input videos.
@@ -1245,7 +1245,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithDeepstackFeatures:
+    ) -> BaseModelOutputWithDeepstackFeatures:
         r"""
         pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
             The tensors corresponding to the input images.
@@ -1319,7 +1319,7 @@ def forward(
         video_grid_thw: torch.LongTensor | None = None,
         cache_position: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLMoeModelOutputWithPast:
+    ) -> Qwen3VLMoeModelOutputWithPast:
         r"""
         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
             The temporal, height and width of feature shape of each image in LLM.
@@ -1573,7 +1573,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast:
+    ) -> Qwen3VLMoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
index 946e972d0fd9..c7a7e5e32617 100644
--- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
@@ -362,7 +362,7 @@ def forward(
         cache_position: torch.LongTensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> Qwen3VLMoeCausalLMOutputWithPast:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index f05ef121efc8..04c735cf1510 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -606,7 +606,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         if self.config.is_decoder:
             use_cache = use_cache if use_cache is not None else self.config.use_cache
         else:
@@ -751,7 +751,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -861,7 +861,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -957,7 +957,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1041,7 +1041,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1142,7 +1142,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1232,7 +1232,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta/modular_roberta.py b/src/transformers/models/roberta/modular_roberta.py
index ef1641fbccd7..84761658b9b0 100644
--- a/src/transformers/models/roberta/modular_roberta.py
+++ b/src/transformers/models/roberta/modular_roberta.py
@@ -225,7 +225,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -335,7 +335,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -431,7 +431,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -515,7 +515,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -616,7 +616,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -706,7 +706,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
index eaa1a4561f6c..ccba796c6cab 100644
--- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
+++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
@@ -619,7 +619,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -781,7 +781,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -897,7 +897,7 @@ def forward(
         encoder_attention_mask: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -995,7 +995,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1080,7 +1080,7 @@ def forward(
         position_ids: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1182,7 +1182,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
@@ -1274,7 +1274,7 @@ def forward(
         start_positions: torch.LongTensor | None = None,
         end_positions: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py
index a43e603a23aa..58b46f2aa51c 100644
--- a/src/transformers/models/roc_bert/modeling_roc_bert.py
+++ b/src/transformers/models/roc_bert/modeling_roc_bert.py
@@ -692,7 +692,7 @@ def forward(
         use_cache: bool | None = None,
         cache_position: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
+    ) -> BaseModelOutputWithPoolingAndCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -858,7 +858,7 @@ def forward(
         labels_attention_mask: torch.Tensor | None = None,
         labels_token_type_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the shape vocabulary.
@@ -1054,7 +1054,7 @@ def forward(
         encoder_attention_mask: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MaskedLMOutput:
+    ) -> MaskedLMOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1213,7 +1213,7 @@ def forward(
         cache_position: torch.Tensor | None = None,
         logits_to_keep: int | torch.Tensor = 0,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions:
+    ) -> CausalLMOutputWithCrossAttentions:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1350,7 +1350,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
+    ) -> SequenceClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1448,7 +1448,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput:
+    ) -> MultipleChoiceModelOutput:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
@@ -1572,7 +1572,7 @@ def forward(
         inputs_embeds: torch.Tensor | None = None,
         labels: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | TokenClassifierOutput:
+    ) -> TokenClassifierOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
@@ -1648,7 +1648,7 @@ def forward(
         start_positions: torch.Tensor | None = None,
         end_positions: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         input_shape_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the shape vocabulary.
diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py
index ef1181ba641a..18b5a86eb8c9 100644
--- a/src/transformers/models/sam/modeling_sam.py
+++ b/src/transformers/models/sam/modeling_sam.py
@@ -1056,7 +1056,7 @@ def get_input_embeddings(self):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamVisionEncoderOutput:
+    ) -> SamVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/sam2/modeling_sam2.py b/src/transformers/models/sam2/modeling_sam2.py
index 7d93142aff65..e0c11cb564e9 100644
--- a/src/transformers/models/sam2/modeling_sam2.py
+++ b/src/transformers/models/sam2/modeling_sam2.py
@@ -623,7 +623,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -675,7 +675,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1569,7 +1569,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py
index 42f30cafd05a..3d0e8e530a36 100644
--- a/src/transformers/models/sam2/modular_sam2.py
+++ b/src/transformers/models/sam2/modular_sam2.py
@@ -728,7 +728,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2HieraDetModelOutput:
+    ) -> Sam2HieraDetModelOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -780,7 +780,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1246,7 +1246,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VisionEncoderOutput:
+    ) -> Sam2VisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam2_video/modeling_sam2_video.py b/src/transformers/models/sam2_video/modeling_sam2_video.py
index 12a9dfc4d1be..9a626a1a4f8d 100644
--- a/src/transformers/models/sam2_video/modeling_sam2_video.py
+++ b/src/transformers/models/sam2_video/modeling_sam2_video.py
@@ -1839,7 +1839,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam2VideoVisionEncoderOutput:
+    ) -> Sam2VideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3/modeling_sam3.py b/src/transformers/models/sam3/modeling_sam3.py
index f068c9f61a95..f4e85e152b5e 100644
--- a/src/transformers/models/sam3/modeling_sam3.py
+++ b/src/transformers/models/sam3/modeling_sam3.py
@@ -1031,7 +1031,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3VisionEncoderOutput:
+    ) -> Sam3VisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

@@ -1399,7 +1399,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_sizes: list[tuple[int, int]] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETREncoderOutput:
+    ) -> Sam3DETREncoderOutput:
         """
         Forward pass for the DETR encoder.

@@ -1694,7 +1694,7 @@ def forward(
         text_mask: torch.Tensor | None = None,
         spatial_shapes: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3DETRDecoderOutput:
+    ) -> Sam3DETRDecoderOutput:
         """
         Forward pass for the DETR decoder.

@@ -2010,7 +2010,7 @@ def forward(
         prompt_features: torch.Tensor | None = None,
         prompt_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3MaskDecoderOutput:
+    ) -> Sam3MaskDecoderOutput:
         """
         Args:
             decoder_queries: Decoder output queries [batch_size, num_queries, hidden_size]
@@ -2147,7 +2147,7 @@ def get_text_features(
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Example:

diff --git a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
index 78674ca80427..929e15108019 100644
--- a/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
+++ b/src/transformers/models/sam3_tracker/modeling_sam3_tracker.py
@@ -1065,7 +1065,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVisionEncoderOutput:
+    ) -> Sam3TrackerVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
index 6a07c21dbd18..80f6cc1fba4d 100644
--- a/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py
@@ -1862,7 +1862,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
index 1d357fe923b0..44e46de7f37c 100644
--- a/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
+++ b/src/transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py
@@ -550,7 +550,7 @@ def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | Sam3TrackerVideoVisionEncoderOutput:
+    ) -> Sam3TrackerVideoVisionEncoderOutput:
         r"""
         pixel_values (`torch.FloatTensor`):
             Input pixel values of shape `(batch_size, num_channels, height, width)`.
diff --git a/src/transformers/models/sam_hq/modeling_sam_hq.py b/src/transformers/models/sam_hq/modeling_sam_hq.py
index 176d875a4000..19920b563a1a 100644
--- a/src/transformers/models/sam_hq/modeling_sam_hq.py
+++ b/src/transformers/models/sam_hq/modeling_sam_hq.py
@@ -556,7 +556,7 @@ def get_input_embeddings(self):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/sam_hq/modular_sam_hq.py b/src/transformers/models/sam_hq/modular_sam_hq.py
index 481345905cd8..ba6486d7cf4a 100644
--- a/src/transformers/models/sam_hq/modular_sam_hq.py
+++ b/src/transformers/models/sam_hq/modular_sam_hq.py
@@ -192,7 +192,7 @@ class SamHQVisionEncoder(SamVisionEncoder, SamHQPreTrainedModel):
     @check_model_inputs(tie_last_hidden_states=False)
     def forward(
         self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | SamHQVisionEncoderOutput:
+    ) -> SamHQVisionEncoderOutput:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py
index 657d5e3e4554..9b3a245b7e93 100644
--- a/src/transformers/models/siglip/modeling_siglip.py
+++ b/src/transformers/models/siglip/modeling_siglip.py
@@ -791,7 +791,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -821,7 +821,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         interpolate_pos_encoding: bool = False,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

diff --git a/src/transformers/models/siglip2/modeling_siglip2.py b/src/transformers/models/siglip2/modeling_siglip2.py
index 3182fac5ca8c..f050682b0195 100644
--- a/src/transformers/models/siglip2/modeling_siglip2.py
+++ b/src/transformers/models/siglip2/modeling_siglip2.py
@@ -849,7 +849,7 @@ def get_text_features(
         attention_mask: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         Examples:

@@ -880,7 +880,7 @@ def get_image_features(
         pixel_attention_mask: torch.Tensor | None = None,
         spatial_shapes: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/siglip2/modular_siglip2.py b/src/transformers/models/siglip2/modular_siglip2.py index 9c022de838e9..0220ed131ae4 100644 --- a/src/transformers/models/siglip2/modular_siglip2.py +++ b/src/transformers/models/siglip2/modular_siglip2.py @@ -367,7 +367,7 @@ def get_image_features( pixel_attention_mask: torch.Tensor | None = None, spatial_shapes: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/smolvlm/modeling_smolvlm.py b/src/transformers/models/smolvlm/modeling_smolvlm.py index 745817caa8b6..23d24234cc0a 100644 --- a/src/transformers/models/smolvlm/modeling_smolvlm.py +++ b/src/transformers/models/smolvlm/modeling_smolvlm.py @@ -348,7 +348,7 @@ def forward( pixel_values, patch_attention_mask: torch.BoolTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: batch_size = pixel_values.size(0) if patch_attention_mask is None: patch_size = self.patch_size @@ -533,7 +533,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -608,7 +608,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | SmolVLMBaseModelOutputWithPast: + ) -> SmolVLMBaseModelOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. @@ -778,7 +778,7 @@ def forward( return_dict: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SmolVLMCausalLMOutputWithPast: + ) -> SmolVLMCausalLMOutputWithPast: r""" pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*): Mask to avoid performing attention on padding pixel indices. diff --git a/src/transformers/models/smolvlm/modular_smolvlm.py b/src/transformers/models/smolvlm/modular_smolvlm.py index 2298499b670c..e0c7a8b297c8 100644 --- a/src/transformers/models/smolvlm/modular_smolvlm.py +++ b/src/transformers/models/smolvlm/modular_smolvlm.py @@ -201,7 +201,7 @@ def get_image_features( pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -276,7 +276,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | SmolVLMBaseModelOutputWithPast: + ) -> SmolVLMBaseModelOutputWithPast: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 8e930c0b604e..2f7d6395c5d5 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -296,7 +296,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -374,7 +374,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `batch_size, sequence_length`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 8613fef45695..100da24a3c1d 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -364,7 +364,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/starcoder2/modular_starcoder2.py b/src/transformers/models/starcoder2/modular_starcoder2.py index f6fd0841b217..478593ecb60d 100644 --- a/src/transformers/models/starcoder2/modular_starcoder2.py +++ b/src/transformers/models/starcoder2/modular_starcoder2.py @@ -153,7 +153,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index cb30a42ac764..9745fd8270b1 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -691,7 +691,7 @@ def forward( use_cache=None, cache_position=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoEModelOutputWithPastAndCrossAttentions: + ) -> MoEModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -949,7 +949,7 @@ def forward( decoder_inputs_embeds: torch.Tensor | None = None, cache_position: 
torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput: + ) -> Seq2SeqMoEModelOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -1104,7 +1104,7 @@ def forward( output_router_logits: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput: + ) -> Seq2SeqMoEOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -1242,7 +1242,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | MoEModelOutput: + ) -> MoEModelOutput: use_cache = False encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/switch_transformers/modular_switch_transformers.py b/src/transformers/models/switch_transformers/modular_switch_transformers.py index 64f91cc20c39..6d0a9f31a81a 100644 --- a/src/transformers/models/switch_transformers/modular_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modular_switch_transformers.py @@ -447,7 +447,7 @@ def forward( use_cache=None, cache_position=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MoEModelOutputWithPastAndCrossAttentions: + ) -> MoEModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -705,7 +705,7 @@ def forward( decoder_inputs_embeds: torch.Tensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEModelOutput: + ) -> Seq2SeqMoEModelOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -795,7 +795,7 @@ def forward( output_router_logits: bool | None = False, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqMoEOutput: + ) -> Seq2SeqMoEOutput: if encoder_outputs is None: encoder_outputs = self.encoder( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, **kwargs @@ -933,7 +933,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | MoEModelOutput: + ) -> MoEModelOutput: use_cache = False encoder_outputs = self.encoder( input_ids=input_ids, diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index f6bc2ca10eae..0874f476e9ad 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -691,7 +691,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -790,7 +790,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None 
= None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if encoder_hidden_states is None: @@ -1031,7 +1031,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index c55d20ba7b66..bbfe9542737e 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -701,7 +701,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -800,7 +800,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPastAndCrossAttentions: + ) -> BaseModelOutputWithPastAndCrossAttentions: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") if encoder_hidden_states is None: @@ -1041,7 +1041,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py index da8bdc4905bc..d6812713b61a 100644 --- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py @@ -811,7 +811,7 @@ def __init__( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: # pixel_values: (batch_size, channels, height, width) # image_features: Image feature tensor of shape (num_images, image_length, embed_dim). vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) @@ -1256,7 +1256,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. 
Selected in the range `[0, diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py index 3245a041d20b..9e7d8df54e31 100644 --- a/src/transformers/models/t5gemma2/modular_t5gemma2.py +++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py @@ -863,7 +863,7 @@ def __init__( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: # pixel_values: (batch_size, channels, height, width) # image_features: Image feature tensor of shape (num_images, image_length, embed_dim). vision_outputs = self.vision_tower(pixel_values=pixel_values, return_dict=True, **kwargs) @@ -1295,7 +1295,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | Seq2SeqLMOutput: + ) -> Seq2SeqLMOutput: r""" decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, diff --git a/src/transformers/models/video_llama_3/modeling_video_llama_3.py b/src/transformers/models/video_llama_3/modeling_video_llama_3.py index d19162af17a3..917487d2335e 100644 --- a/src/transformers/models/video_llama_3/modeling_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modeling_video_llama_3.py @@ -351,7 +351,7 @@ def forward( cu_seqlens: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`): The cumulative sequence lengths of each image or video feature. @@ -444,7 +444,7 @@ def forward( grid_thw: torch.Tensor, merge_sizes: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`): The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values. @@ -554,7 +554,7 @@ def get_video_features( video_grid_thw: torch.LongTensor, video_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -578,7 +578,7 @@ def get_image_features( image_grid_thw: torch.LongTensor, image_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -662,7 +662,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3ModelOutputWithPast: + ) -> VideoLlama3ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. 
@@ -829,7 +829,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3CausalLMOutputWithPast: + ) -> VideoLlama3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/video_llama_3/modular_video_llama_3.py b/src/transformers/models/video_llama_3/modular_video_llama_3.py index caeb8483afdd..6bd2ad57568b 100644 --- a/src/transformers/models/video_llama_3/modular_video_llama_3.py +++ b/src/transformers/models/video_llama_3/modular_video_llama_3.py @@ -416,7 +416,7 @@ def forward( cu_seqlens: torch.Tensor, position_embeddings: tuple[torch.Tensor, torch.Tensor], **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`): The cumulative sequence lengths of each image or video feature. @@ -499,7 +499,7 @@ def forward( grid_thw: torch.Tensor, merge_sizes: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" grid_thw (`torch.LongTensor` of shape `(num_images_or_videos, 3)`): The temporal, height and width dimensions of feature shape for each image. Each row contains [t, h, w] values. @@ -602,7 +602,7 @@ def get_video_features( video_grid_thw: torch.LongTensor, video_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -626,7 +626,7 @@ def get_image_features( image_grid_thw: torch.LongTensor, image_merge_sizes: torch.LongTensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -669,7 +669,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3ModelOutputWithPast: + ) -> VideoLlama3ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -793,7 +793,7 @@ def forward( video_compression_mask: torch.BoolTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlama3CausalLMOutputWithPast: + ) -> VideoLlama3CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should either be in `[0, ..., diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index fe86cbb2512f..053bd296e09b 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -181,7 +181,7 @@ def get_image_features( vision_feature_select_strategy: str | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_images (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. @@ -228,7 +228,7 @@ def get_video_features( vision_feature_layer: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor]` of shape `(batch_size, num_frames, channels, height, width)`) The tensors corresponding to the input videos. @@ -320,7 +320,7 @@ def forward( return_dict: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | VideoLlavaModelOutputWithPast: + ) -> VideoLlavaModelOutputWithPast: r""" pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. Pixel values can be obtained using @@ -462,7 +462,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | VideoLlavaCausalLMOutputWithPast: + ) -> VideoLlavaCausalLMOutputWithPast: r""" pixel_values_images (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): The tensors corresponding to the input images. Pixel values can be obtained using diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index a5d287c94b31..199e2060ddd5 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -160,7 +160,7 @@ def get_image_features( vision_feature_layers: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. 
@@ -351,7 +351,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs, - ) -> tuple | VipLlavaCausalLMOutputWithPast: + ) -> VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select diff --git a/src/transformers/models/vipllava/modular_vipllava.py b/src/transformers/models/vipllava/modular_vipllava.py index 3b9a2dfdf284..c25d77bb8a05 100644 --- a/src/transformers/models/vipllava/modular_vipllava.py +++ b/src/transformers/models/vipllava/modular_vipllava.py @@ -81,7 +81,7 @@ def get_image_features( vision_feature_layers: int | list[int] | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`): The tensors corresponding to the input images. @@ -218,7 +218,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **lm_kwargs, - ) -> tuple | VipLlavaCausalLMOutputWithPast: + ) -> VipLlavaCausalLMOutputWithPast: r""" vision_feature_layers (`Union[int, list[int]]`, *optional*): The vision feature layer, or the list of indexes of the layers to select diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 77aa6feb384a..9f97373a907e 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -111,7 +111,7 @@ def get_text_features( position_ids: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -143,7 +143,7 @@ def get_text_features( @auto_docstring def get_image_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py index b5da48a61e4c..96c4d9786c53 100644 --- a/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +++ b/src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -397,7 +397,7 @@ def forward( dataset_index: torch.Tensor | None = None, output_hidden_states: bool | None = None, **kwargs, - ): + ) -> BackboneOutput: r""" dataset_index (`torch.Tensor` of shape `(batch_size,)`): Index to use in the Mixture-of-Experts (MoE) blocks of the backbone. diff --git a/src/transformers/models/vjepa2/modeling_vjepa2.py b/src/transformers/models/vjepa2/modeling_vjepa2.py index 332e02108056..40df7132087c 100644 --- a/src/transformers/models/vjepa2/modeling_vjepa2.py +++ b/src/transformers/models/vjepa2/modeling_vjepa2.py @@ -1087,7 +1087,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | ImageClassifierOutput: + ) -> ImageClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the image classification/regression loss. 
Indices should be in `[0, ..., diff --git a/src/transformers/models/voxtral/modeling_voxtral.py b/src/transformers/models/voxtral/modeling_voxtral.py index 594ca09efe55..40d07ae24677 100644 --- a/src/transformers/models/voxtral/modeling_voxtral.py +++ b/src/transformers/models/voxtral/modeling_voxtral.py @@ -296,7 +296,7 @@ def forward( input_features, attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -403,7 +403,7 @@ def get_decoder(self): ) def get_audio_features( self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be diff --git a/src/transformers/models/voxtral/modular_voxtral.py b/src/transformers/models/voxtral/modular_voxtral.py index 7fddbcd29648..d1990cc2f1bb 100644 --- a/src/transformers/models/voxtral/modular_voxtral.py +++ b/src/transformers/models/voxtral/modular_voxtral.py @@ -71,7 +71,7 @@ def forward( input_features, attention_mask=None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): @@ -169,7 +169,7 @@ def get_decoder(self): ) def get_audio_features( self, input_features: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" input_features (`torch.FloatTensor`): Float values of mel features extracted from the raw speech waveform. 
Raw speech waveform can be diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index f08c4a695e2b..109c51912612 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -582,7 +582,7 @@ def forward( output_attentions: bool | None = None, output_hidden_states: bool | None = None, return_dict: bool | None = None, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1211,7 +1211,7 @@ def get_text_features( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1244,7 +1244,7 @@ def get_video_features( self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index e8006b07e22d..96060f7afc22 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -617,7 +617,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -760,7 +760,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -869,7 +869,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -965,7 +965,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1049,7 +1049,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. 
@@ -1150,7 +1150,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -1218,7 +1218,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py index f9404c0d7993..024e0f376a79 100644 --- a/src/transformers/models/xlm_roberta/modular_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modular_xlm_roberta.py @@ -83,7 +83,7 @@ def forward( use_cache: bool | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -177,7 +177,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -246,7 +246,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: @@ -326,7 +326,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -419,7 +419,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`: @@ -483,7 +483,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index c765000ca274..d83ccf713fc2 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -613,7 +613,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: @@ -802,7 +802,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -901,7 +901,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -963,7 +963,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1035,7 +1035,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -1126,7 +1126,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
@@ -1191,7 +1191,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.roberta( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py index df51342415e9..208db351d138 100644 --- a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py @@ -303,7 +303,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -402,7 +402,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MaskedLMOutput: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -464,7 +464,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -536,7 +536,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See @@ -627,7 +627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
@@ -692,7 +692,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: outputs = self.roberta( input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xlstm/modeling_xlstm.py b/src/transformers/models/xlstm/modeling_xlstm.py index 33f263f7f54b..d428e0f203ba 100644 --- a/src/transformers/models/xlstm/modeling_xlstm.py +++ b/src/transformers/models/xlstm/modeling_xlstm.py @@ -1413,7 +1413,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMOutput: + ) -> xLSTMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. @@ -1581,7 +1581,7 @@ def forward( use_cache: bool | None = None, output_hidden_states: bool | None = None, **kwargs, - ) -> tuple | xLSTMCausalLMOutput: + ) -> xLSTMCausalLMOutput: r""" cache_params (`xLSTMCache`, *optional*): The xLSTMCache that carries the RNN states. diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index c59bbc4bb022..38e60923a586 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -719,7 +719,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -885,7 +885,7 @@ def forward( cache_position: torch.Tensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | CausalLMOutputWithCrossAttentions: + ) -> CausalLMOutputWithCrossAttentions: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -995,7 +995,7 @@ def forward( encoder_attention_mask: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MaskedLMOutput: + ) -> MaskedLMOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1087,7 +1087,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + ) -> SequenceClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. 
Default: the index @@ -1166,7 +1166,7 @@ def forward( position_ids: torch.LongTensor | None = None, inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | MultipleChoiceModelOutput: + ) -> MultipleChoiceModelOutput: r""" input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`): Indices of input sequence tokens in the vocabulary. @@ -1271,7 +1271,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | TokenClassifierOutput: + ) -> TokenClassifierOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index @@ -1357,7 +1357,7 @@ def forward( start_positions: torch.LongTensor | None = None, end_positions: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + ) -> QuestionAnsweringModelOutput: r""" lang_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices of the language adapters that should be activated for each sample, respectively. Default: the index From abaca9c34049cb47a317066a1e5d28dcf16398ec Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:39:01 +0100 Subject: [PATCH 04/20] Fix blt by removing check_model_inputs; matching other classes This is the main functional change; the rest of the series is just typing updates. --- src/transformers/models/blt/modeling_blt.py | 5 ++--- src/transformers/models/blt/modular_blt.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index 0dea1ef44a67..cbb0a5484e68 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -742,7 +742,6 @@ def __init__(self, config: BltLocalDecoderConfig): self.post_init() - @check_model_inputs def forward( self, input_ids: torch.LongTensor | None = None, @@ -1226,7 +1225,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1403,7 +1402,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention.
This tensor contains the processed image features that diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index e27a24ee59d7..86507d750dc0 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -673,7 +673,6 @@ def __init__(self, config: BltLocalDecoderConfig): self.post_init() - @check_model_inputs def forward( self, input_ids: torch.LongTensor | None = None, @@ -950,7 +949,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") @@ -1118,7 +1117,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: # Call parent forward but exclude cross_attention_states from model call outputs = self.model( input_ids=input_ids, From 3652612d58c7e65fed9ab07e4aa04352f2d81472 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:46:47 +0100 Subject: [PATCH 05/20] Use can_return_tuple on altclip Also remove `return_dict` from altclip and clap: the `can_return_tuple` decorator should take care of it fully --- src/transformers/models/altclip/modeling_altclip.py | 5 ++--- src/transformers/models/clap/modeling_clap.py | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 62ffd1cc8154..9b6d63ac8298 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -964,6 +964,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple @auto_docstring # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward def forward( @@ -975,14 +976,12 @@ def forward( self, input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, token_type_ids: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, - ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index b799bb92c75e..37bad5056c97 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1456,14 +1456,12 @@ def forward( inputs_embeds: torch.Tensor | None = None, output_attentions: bool | None = None, output_hidden_states: bool | None = None, - return_dict: bool | None = None, **kwargs, ) -> BaseModelOutputWithPoolingAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") From 9d9666a099650eeb13f64a39a70ce492c3cecba2 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 13:54:55 +0100 Subject: [PATCH 06/20] Add docstring for check_model_inputs / can_return_tuple --- src/transformers/utils/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index b459091afb68..62a791b6dd56 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -827,6 +827,9 @@ def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]: Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or use_return_dict=False is set in the config. + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Note: output.to_tuple() convert output to tuple skipping all `None` values. """ @@ -872,6 +875,9 @@ def check_model_inputs( Decorator to intercept specific layer outputs without using hooks. Compatible with torch.compile (Dynamo tracing). + The wrapped method or function should not be typed like `tuple | X`, but instead just `X`, where `X` is the + original return type. This decorator's typing ensures that the return type is correctly represented as `tuple | X`. + Args: tie_last_hidden_states (`bool`, *optional*, defaults to `True`): Whether to overwrite `out.hidden_states[-1]` with the `out.last_hidden_state`. 
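As a brief illustration of the typing convention documented in the patch above: the sketch below is not part of the patch series, and `SampleOutput`/`SampleModel` are hypothetical names chosen only for illustration, but it shows how the ParamSpec-based annotation lets a method annotated with just `X` be inferred as `tuple | X` once decorated — which is why the explicit `tuple[...] | X` unions are dropped throughout the rest of this series.

# Minimal, self-contained sketch of the ParamSpec pattern (assumes Python 3.10+).
# The decorator's runtime logic is elided; only the signature transformation matters.
from collections.abc import Callable
from functools import wraps
from typing import ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")


def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]:
    @wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> tuple | T:
        return func(*args, **kwargs)

    return wrapper


class SampleOutput:
    def to_tuple(self) -> tuple:
        return ()


class SampleModel:
    @can_return_tuple
    def forward(self) -> SampleOutput:  # annotated with just the base type
        return SampleOutput()


# A type checker now infers SampleModel().forward() as `tuple | SampleOutput`,
# so annotating forward() itself as `tuple | SampleOutput` would be redundant.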
From 99d5deb59bf238d8a8bb7fad2e429dc36d8ea610 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 14:06:49 +0100 Subject: [PATCH 07/20] Import ParamSpec via generic as we're in Python 3.10+ --- src/transformers/utils/generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 62a791b6dd56..28d7c72e1975 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -25,10 +25,9 @@ from dataclasses import dataclass, fields, is_dataclass from enum import Enum from functools import partial, wraps -from typing import Any, Optional, TypedDict, TypeVar +from typing import Any, Optional, ParamSpec, TypedDict, TypeVar import numpy as np -from typing_extensions import ParamSpec from ..utils import logging from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires From 6ad9269daa6470fb9bc24513961ccc833f1ea517 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 14:18:35 +0100 Subject: [PATCH 08/20] Add check_decorator_return_types to fix-repo & check-repo --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index d2b1fa8ccb7f..e3681a5044bd 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ check-repo: -python utils/check_config_docstrings.py -python utils/check_config_attributes.py -python utils/check_doctest_list.py + -python utils/check_decorator_return_types.py -python utils/update_metadata.py --check-only -python utils/add_dates.py --check-only -@{ \ @@ -56,6 +57,7 @@ fix-repo: style -python utils/check_pipeline_typing.py --fix_and_overwrite -python utils/check_doctest_list.py --fix_and_overwrite -python utils/check_docstrings.py --fix_and_overwrite + -python utils/check_decorator_return_types.py --fix_and_overwrite -python utils/add_dates.py From a9afbdb2bc83ede5781438e67234b4045c047a17 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:35:08 +0100 Subject: [PATCH 09/20] Remove some dead code --- utils/check_decorator_return_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 2c6cbb3d5ac0..cb96a3e01a03 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -196,7 +196,7 @@ def _is_delegating_to_super(func_node: ast.AST) -> bool: return False -def _collect_decorated_functions(tree: ast.AST, file_path: str) -> list[tuple[ast.AST, str]]: +def _collect_decorated_functions(tree: ast.AST) -> list[tuple[ast.AST, str]]: """Return (function_node, decorator_name) pairs for targeted decorators.""" functions: list[tuple[ast.AST, str]] = [] @@ -254,7 +254,7 @@ def check_decorator_return_types(overwrite: bool = False): print(f"Skipping {file_path} due to SyntaxError: {e}") continue - functions = _collect_decorated_functions(tree, file_path) + functions = _collect_decorated_functions(tree) if not functions: continue From aa97fc675a29859b43d8e856b39ffdc5715580a7 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:38:13 +0100 Subject: [PATCH 10/20] Update _decorator_name to not check for our targets yet --- utils/check_decorator_return_types.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index cb96a3e01a03..e166ae6a441b 100644 --- a/utils/check_decorator_return_types.py +++ 
b/utils/check_decorator_return_types.py @@ -68,7 +68,7 @@ def _iter_python_files(root: str) -> Iterable[str]: def _decorator_name(node: ast.expr) -> str | None: - """Return the simple name of a decorator, if it matches a target. + """Return the simple name of a decorator Handles forms like: - @can_return_tuple @@ -77,19 +77,12 @@ def _decorator_name(node: ast.expr) -> str | None: - @utils.check_model_inputs(...) """ - target = node - if isinstance(target, ast.Call): - target = target.func + target = node.func if isinstance(node, ast.Call) else node if isinstance(target, ast.Name): - name = target.id + return target.id elif isinstance(target, ast.Attribute): - name = target.attr - else: - return None - - if name in TARGET_DECORATORS: - return name + return target.attr return None @@ -207,7 +200,7 @@ def _collect_decorated_functions(tree: ast.AST) -> list[tuple[ast.AST, str]]: continue for deco in node.decorator_list: name = _decorator_name(deco) - if name is not None: + if name in TARGET_DECORATORS: functions.append((node, name)) break return functions From f71a072f874806ec9c558b5ddce45ece00c949da Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:41:56 +0100 Subject: [PATCH 11/20] Use src/transformers/models path instead --- utils/check_decorator_return_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index e166ae6a441b..37a37a8f1519 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers" +PATH_TO_TRANSFORMERS = "src/transformers/models" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} @@ -255,8 +255,8 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. - if _is_delegating_to_super(func_node): - continue + # if _is_delegating_to_super(func_node): + # continue returns = func_node.returns From 63155b62a72ec998ee188ab178996293c5e5a81d Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:42:36 +0100 Subject: [PATCH 12/20] Revert "Use src/transformers/models path instead" This reverts commit f71a072f874806ec9c558b5ddce45ece00c949da. --- utils/check_decorator_return_types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 37a37a8f1519..e166ae6a441b 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers/models" +PATH_TO_TRANSFORMERS = "src/transformers" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} @@ -255,8 +255,8 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. 
- # if _is_delegating_to_super(func_node): - # continue + if _is_delegating_to_super(func_node): + continue returns = func_node.returns From 864a6a7e0559fff9429c0c1be861f05fe95064f1 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:43:22 +0100 Subject: [PATCH 13/20] Use src/transformers/models path instead Without extra code; this time --- utils/check_decorator_return_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index e166ae6a441b..236b73917c69 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -39,7 +39,7 @@ from dataclasses import dataclass -PATH_TO_TRANSFORMERS = "src/transformers" +PATH_TO_TRANSFORMERS = "src/transformers/models" TARGET_DECORATORS = {"can_return_tuple", "check_model_inputs"} From 456ee32a96a590486d16d8a7fd842cabd9ac5a5c Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:46:32 +0100 Subject: [PATCH 14/20] Simplify _is_none_annotation --- utils/check_decorator_return_types.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 236b73917c69..d64d958e385f 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -87,18 +87,11 @@ def _decorator_name(node: ast.expr) -> str | None: def _is_none_annotation(returns: ast.expr | None) -> bool: - if returns is None: - return True - - # -> None - if isinstance(returns, ast.Constant) and returns.value is None: - return True - - # -> None (as a name) - if isinstance(returns, ast.Name) and returns.id == "None": - return True - - return False + return ( + returns is None + or isinstance(returns, ast.Constant) and returns.value is None + or isinstance(returns, ast.Name) and returns.id == "None" + ) def _is_tuple_type(node: ast.AST) -> bool: From 3b0808e1498889d9e3f7ad8ae1624a13fa6bb992 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:47:48 +0100 Subject: [PATCH 15/20] Explain why super() is skipped --- utils/check_decorator_return_types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index d64d958e385f..4b75f2dd8058 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -248,6 +248,9 @@ def check_decorator_return_types(overwrite: bool = False): for func_node, decorator_name in functions: # Ignore trivial delegations like `return super(...` or `super(...`. + # We skip these as this happens sometimes in modular files with methods that inherit their return + # type from another architecture. Then they'll have no explicit return type, but we'll test via + # the generated modeling file instead. 
if _is_delegating_to_super(func_node): continue From dbd43d60a1ddd6ba900b6c63ce99f33ad6f7d2dc Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:49:32 +0100 Subject: [PATCH 16/20] Add comments for ParamSpec/TypeVar linking to Python docs --- src/transformers/utils/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 28d7c72e1975..1857f71263f2 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -33,6 +33,9 @@ from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires +# See https://docs.python.org/3/library/typing.html#typing.ParamSpec for documentation on how ParamSpec +# should be used alongside TypeVar and ParamSpecArgs (P.args) and ParamSpecKwargs (P.kwargs) for type +# hinting decorators. P = ParamSpec("P") T = TypeVar("T") From 688be4bc9aa2485c879aaf59611d4aad4366e930 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:49:47 +0100 Subject: [PATCH 17/20] Make style --- utils/check_decorator_return_types.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/check_decorator_return_types.py b/utils/check_decorator_return_types.py index 4b75f2dd8058..ca1e0af52dcd 100644 --- a/utils/check_decorator_return_types.py +++ b/utils/check_decorator_return_types.py @@ -89,8 +89,10 @@ def _decorator_name(node: ast.expr) -> str | None: def _is_none_annotation(returns: ast.expr | None) -> bool: return ( returns is None - or isinstance(returns, ast.Constant) and returns.value is None - or isinstance(returns, ast.Name) and returns.id == "None" + or isinstance(returns, ast.Constant) + and returns.value is None + or isinstance(returns, ast.Name) + and returns.id == "None" ) From 3fd9571c96449256980f52d29d7883b1cd1eebef Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 23 Jan 2026 15:59:12 +0100 Subject: [PATCH 18/20] Move decorator typing comments around --- src/transformers/utils/generic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index 1857f71263f2..05c4615ce4c0 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -33,9 +33,7 @@ from .import_utils import is_mlx_available, is_torch_available, is_torch_fx_proxy, requires -# See https://docs.python.org/3/library/typing.html#typing.ParamSpec for documentation on how ParamSpec -# should be used alongside TypeVar and ParamSpecArgs (P.args) and ParamSpecKwargs (P.kwargs) for type -# hinting decorators. +# Used to type hint decorators that modify the signature of the decorated function P = ParamSpec("P") T = TypeVar("T") @@ -824,6 +822,8 @@ def del_attribute_from_modules(module: "torch.nn.Module", key: str): del_attribute_from_modules(submodule, key) +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. def can_return_tuple(func: Callable[P, T]) -> Callable[P, tuple | T]: """ Decorator to wrap model method, to call output.to_tuple() if return_dict=False passed as a kwarg or @@ -870,6 +870,8 @@ class OutputRecorder: class_name: str | None = None +# We follow the example from https://docs.python.org/3/library/typing.html#typing.ParamSpec to type-hint +# this decorator, allowing it to add 'tuple' to the signature of the decorated function. 
def check_model_inputs( func: Callable[P, T] | None = None, *, tie_last_hidden_states: bool = True ) -> Callable[P, tuple | T]: From bc81a600c595f4f42dc60e4f65dd1bf0e0762901 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Fri, 20 Feb 2026 10:39:26 +0100 Subject: [PATCH 19/20] Rerun utils/check_decorator_return_types.py --- src/transformers/models/align/modeling_align.py | 8 ++++---- src/transformers/models/altclip/modeling_altclip.py | 6 +++--- src/transformers/models/blt/modular_blt.py | 2 +- .../models/chinese_clip/modeling_chinese_clip.py | 2 +- src/transformers/models/clap/modeling_clap.py | 2 +- src/transformers/models/clipseg/modeling_clipseg.py | 2 +- .../conditional_detr/modeling_conditional_detr.py | 2 +- src/transformers/models/d_fine/modeling_d_fine.py | 4 ++-- .../models/deformable_detr/modeling_deformable_detr.py | 6 +++--- .../models/deformable_detr/modular_deformable_detr.py | 8 ++++---- src/transformers/models/detr/modeling_detr.py | 6 +++--- .../models/exaone_moe/modeling_exaone_moe.py | 2 +- .../models/florence2/modeling_florence2.py | 4 +--- src/transformers/models/florence2/modular_florence2.py | 4 +--- src/transformers/models/gemma3n/modeling_gemma3n.py | 2 +- src/transformers/models/gemma3n/modular_gemma3n.py | 2 +- src/transformers/models/git/modeling_git.py | 2 +- .../models/granite_speech/modeling_granite_speech.py | 4 +--- .../models/higgs_audio_v2/modeling_higgs_audio_v2.py | 2 +- .../models/higgs_audio_v2/modular_higgs_audio_v2.py | 2 +- src/transformers/models/idefics/vision.py | 2 +- src/transformers/models/kosmos2/modeling_kosmos2.py | 2 +- src/transformers/models/layoutlm/modeling_layoutlm.py | 2 +- src/transformers/models/lw_detr/modeling_lw_detr.py | 2 +- src/transformers/models/lw_detr/modular_lw_detr.py | 2 +- src/transformers/models/markuplm/modeling_markuplm.py | 2 +- src/transformers/models/mlcd/modeling_mlcd.py | 2 +- .../modeling_moonshine_streaming.py | 2 +- .../moonshine_streaming/modular_moonshine_streaming.py | 2 +- .../models/pp_doclayout_v3/modeling_pp_doclayout_v3.py | 6 +++--- .../models/pp_doclayout_v3/modular_pp_doclayout_v3.py | 6 +++--- src/transformers/models/qwen3_5/modeling_qwen3_5.py | 8 ++++---- src/transformers/models/qwen3_5/modular_qwen3_5.py | 2 +- .../models/qwen3_5_moe/modeling_qwen3_5_moe.py | 8 ++++---- .../models/qwen3_omni_moe/modeling_qwen3_omni_moe.py | 4 ++-- .../models/qwen3_vl_moe/modeling_qwen3_vl_moe.py | 2 +- .../models/qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +- src/transformers/models/rt_detr/modeling_rt_detr.py | 6 +++--- src/transformers/models/rt_detr/modular_rt_detr.py | 6 +++--- .../models/rt_detr_v2/modeling_rt_detr_v2.py | 6 +++--- src/transformers/models/splinter/modeling_splinter.py | 2 +- src/transformers/models/t5gemma2/modeling_t5gemma2.py | 2 +- src/transformers/models/t5gemma2/modular_t5gemma2.py | 2 +- .../modeling_vibevoice_acoustic_tokenizer.py | 10 +++++++--- .../modular_vibevoice_acoustic_tokenizer.py | 10 +++++++--- .../voxtral_realtime/modeling_voxtral_realtime.py | 4 ++-- .../voxtral_realtime/modular_voxtral_realtime.py | 4 ++-- src/transformers/models/x_clip/modeling_x_clip.py | 2 +- 48 files changed, 92 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 211ce7aa6d9e..962ad6de9989 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -771,7 +771,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = 
True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -897,7 +897,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" Examples: @@ -1010,7 +1010,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndNoAttention: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Examples: @@ -1173,7 +1173,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | AlignOutput: + ) -> AlignOutput: r""" return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 9bb228715967..c8b1c1532bbc 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -384,7 +384,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -607,7 +607,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1053,7 +1053,7 @@ def forward( return_dict: bool | None = None, output_hidden_states: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPoolingAndProjection: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Examples: diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index 9232704341c2..539c99ef5670 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -1131,7 +1131,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | CausalLMOutputWithPast: + ) -> CausalLMOutputWithPast: r""" cross_attention_states (`torch.FloatTensor`, *optional*): Output of the vision model, used for cross-attention. 
This tensor contains the processed image features that diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index c2f60f03c14c..8c76be58c32c 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -631,7 +631,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 4f24a8f0aa99..f43091b83950 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1259,7 +1259,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index bf179063b61a..b29d033ea59e 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -489,7 +489,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2de83de19c12..6bf377619d40 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1674,7 +1674,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | ConditionalDetrSegmentationOutput: + ) -> ConditionalDetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/d_fine/modeling_d_fine.py b/src/transformers/models/d_fine/modeling_d_fine.py index 1c758f8b1dcd..405d24cd8510 100644 --- a/src/transformers/models/d_fine/modeling_d_fine.py +++ b/src/transformers/models/d_fine/modeling_d_fine.py @@ -1627,7 +1627,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineModelOutput: + ) -> DFineModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1948,7 +1948,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DFineObjectDetectionOutput: + ) -> DFineObjectDetectionOutput: r""" Example: diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 3ee685a887c1..41fa7b40d950 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1035,7 +1035,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1312,7 +1312,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1571,7 +1571,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. 
diff --git a/src/transformers/models/deformable_detr/modular_deformable_detr.py b/src/transformers/models/deformable_detr/modular_deformable_detr.py index dfbc0783fb0a..213c5571edfd 100644 --- a/src/transformers/models/deformable_detr/modular_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modular_deformable_detr.py @@ -696,7 +696,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -817,7 +817,7 @@ def forward( level_start_index=None, valid_ratios=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> DeformableDetrDecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1094,7 +1094,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrModelOutput: + ) -> DeformableDetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. @@ -1338,7 +1338,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DeformableDetrObjectDetectionOutput: + ) -> DeformableDetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Not used by default. Can be used to mask object queries. diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 4906b3510f44..e29a17e3794b 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1148,7 +1148,7 @@ def forward( inputs_embeds: torch.FloatTensor | None = None, decoder_inputs_embeds: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrModelOutput: + ) -> DetrModelOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1327,7 +1327,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrObjectDetectionOutput: + ) -> DetrObjectDetectionOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. Mask values selected in `[0, 1]`: @@ -1488,7 +1488,7 @@ def forward( decoder_inputs_embeds: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | DetrSegmentationOutput: + ) -> DetrSegmentationOutput: r""" decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*): Mask to avoid performing attention on certain object queries in the decoder. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/exaone_moe/modeling_exaone_moe.py b/src/transformers/models/exaone_moe/modeling_exaone_moe.py index 35754ab51566..82c7a392a3ed 100644 --- a/src/transformers/models/exaone_moe/modeling_exaone_moe.py +++ b/src/transformers/models/exaone_moe/modeling_exaone_moe.py @@ -512,7 +512,7 @@ def forward( use_cache: bool | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: if (input_ids is None) ^ (inputs_embeds is not None): raise ValueError("You must specify exactly one of input_ids or inputs_embeds") diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index d92873f1aadf..18f5d9a3681b 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -555,9 +555,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py index c5a826571b5a..18a504e1ba67 100644 --- a/src/transformers/models/florence2/modular_florence2.py +++ b/src/transformers/models/florence2/modular_florence2.py @@ -1406,9 +1406,7 @@ def __init__(self, config: Florence2VisionConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: for conv, block in zip(self.convs, self.blocks): hidden_states = conv(hidden_states) for layer in block: diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index ef4ea9602a7b..73889e0cadc6 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1447,7 +1447,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. Args: diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 74c006133d99..b038e2005116 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -1898,7 +1898,7 @@ def __init__(self, config: Gemma3nAudioConfig): @capture_outputs def forward( self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | Gemma3nAudioEncoderModelOutput: + ) -> Gemma3nAudioEncoderModelOutput: """Encodes a batch of MELs. 
Args: diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 15a0395a1962..5a1a973e1c2f 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -770,7 +770,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py index 1aeae5bf1381..d7c7f1642deb 100644 --- a/src/transformers/models/granite_speech/modeling_granite_speech.py +++ b/src/transformers/models/granite_speech/modeling_granite_speech.py @@ -306,9 +306,7 @@ def __init__(self, config: GraniteSpeechEncoderConfig): @merge_with_config_defaults @capture_outputs - def forward( - self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs] - ) -> tuple | BaseModelOutputWithPooling: + def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutputWithPooling: hidden_states = self.input_linear(hidden_states) for idx, layer in enumerate(self.layers, start=1): hidden_states = layer(hidden_states, attention_dists=self.attention_dists) diff --git a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py index f760bc611f80..f1a06f4221a9 100644 --- a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py @@ -675,7 +675,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py index d7bbce5248b2..646693131039 100644 --- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py +++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py @@ -557,7 +557,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ): + ) -> CausalLMOutputWithPast: r""" audio_input_ids (`torch.LongTensor` of shape `(batch_size, num_audio_frames, num_codebooks)`, *optional*): Indices of audio codebook tokens. 
diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 2cc51ac34993..1352519de55a 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -356,7 +356,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index e7073019d8a3..eccb957c0c02 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -448,7 +448,7 @@ def forward( output_hidden_states: bool | None = None, return_dict: bool | None = None, **kwargs, - ) -> tuple | BaseModelOutput: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 6f13f274f7d6..76fb26f1e640 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -326,7 +326,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/lw_detr/modeling_lw_detr.py b/src/transformers/models/lw_detr/modeling_lw_detr.py index 08aeb6bcea67..44f9ce107794 100644 --- a/src/transformers/models/lw_detr/modeling_lw_detr.py +++ b/src/transformers/models/lw_detr/modeling_lw_detr.py @@ -1133,7 +1133,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/lw_detr/modular_lw_detr.py b/src/transformers/models/lw_detr/modular_lw_detr.py index d783f8adcacd..d2b0a96088e1 100644 --- a/src/transformers/models/lw_detr/modular_lw_detr.py +++ b/src/transformers/models/lw_detr/modular_lw_detr.py @@ -1109,7 +1109,7 @@ def forward( encoder_hidden_states: torch.Tensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> LwDetrDecoderOutput: intermediate = () intermediate_reference_points = (reference_points,) diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index a89f526c7f71..8f1eafa11799 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -474,7 +474,7 @@ def forward( output_hidden_states: bool | None = False, return_dict: bool | None = True, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.Tensor] | BaseModelOutput: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/models/mlcd/modeling_mlcd.py b/src/transformers/models/mlcd/modeling_mlcd.py index 
1e8f526f6168..a1f90104154d 100644 --- a/src/transformers/models/mlcd/modeling_mlcd.py +++ b/src/transformers/models/mlcd/modeling_mlcd.py @@ -479,7 +479,7 @@ def forward( self, pixel_values: torch.FloatTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: if pixel_values is None: raise ValueError("You have to specify pixel_values") diff --git a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py index a58780b73252..123af47a918b 100644 --- a/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modeling_moonshine_streaming.py @@ -819,7 +819,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention diff --git a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py index 2f1f6058316a..3f68f2622766 100644 --- a/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py +++ b/src/transformers/models/moonshine_streaming/modular_moonshine_streaming.py @@ -363,7 +363,7 @@ def forward( encoder_hidden_states: torch.FloatTensor | None = None, encoder_attention_mask: torch.Tensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPast: + ) -> BaseModelOutputWithPast: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention diff --git a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py index d4c275b93eed..17e8e6f34a0b 100644 --- a/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py @@ -1150,7 +1150,7 @@ def forward( norm=None, mask_feat=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> PPDocLayoutV3DecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1655,7 +1655,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput: + ) -> PPDocLayoutV3ModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1977,7 +1977,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput: + ) -> PPDocLayoutV3ForObjectDetectionOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you diff --git a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py index bd5fe3b76857..fbe05c3f1f91 100644 --- a/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +++ b/src/transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py @@ -978,7 +978,7 @@ def forward( norm=None, mask_feat=None, **kwargs: Unpack[TransformersKwargs], - ): + ) -> PPDocLayoutV3DecoderOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`): @@ -1116,7 +1116,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ModelOutput: + ) -> PPDocLayoutV3ModelOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you @@ -1435,7 +1435,7 @@ def forward( encoder_outputs: torch.FloatTensor | None = None, labels: list[dict] | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple[torch.FloatTensor] | PPDocLayoutV3ForObjectDetectionOutput: + ) -> PPDocLayoutV3ForObjectDetectionOutput: r""" inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you diff --git a/src/transformers/models/qwen3_5/modeling_qwen3_5.py b/src/transformers/models/qwen3_5/modeling_qwen3_5.py index 96efb5838ca4..baefd277b499 100644 --- a/src/transformers/models/qwen3_5/modeling_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modeling_qwen3_5.py @@ -1518,7 +1518,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | 
BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1535,7 +1535,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. @@ -1646,7 +1646,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5ModelOutputWithPast: + ) -> Qwen3_5ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -1887,7 +1887,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5CausalLMOutputWithPast: + ) -> Qwen3_5CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py index 2c3297386b91..e34cee3d41a3 100644 --- a/src/transformers/models/qwen3_5/modular_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py @@ -733,7 +733,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5ModelOutputWithPast: + ) -> Qwen3_5ModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. diff --git a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py index 8fbccbd23db1..19cc78f900c9 100644 --- a/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/modeling_qwen3_5_moe.py @@ -1643,7 +1643,7 @@ def get_video_features( pixel_values_videos: torch.FloatTensor, video_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input videos. @@ -1660,7 +1660,7 @@ def get_image_features( pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | BaseModelOutputWithPooling: + ) -> BaseModelOutputWithPooling: r""" pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): The tensors corresponding to the input images. 
@@ -1771,7 +1771,7 @@ def forward( video_grid_thw: torch.LongTensor | None = None, cache_position: torch.LongTensor | None = None, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeModelOutputWithPast: + ) -> Qwen3_5MoeModelOutputWithPast: r""" image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): The temporal, height and width of feature shape of each image in LLM. @@ -2089,7 +2089,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3_5MoeCausalLMOutputWithPast: + ) -> Qwen3_5MoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 89aa791e2684..9667257c49d5 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -1708,7 +1708,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. @@ -2968,7 +2968,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index dee473790fef..ea7dab69095e 100644 --- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -933,7 +933,7 @@ def forward( visual_pos_masks: torch.Tensor | None = None, deepstack_visual_embeds: list[torch.Tensor] | None = None, **kwargs: Unpack[FlashAttentionKwargs], - ) -> tuple | MoeModelOutputWithPast: + ) -> MoeModelOutputWithPast: r""" visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): The mask of the visual positions. diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index b64dcb69a827..40fe39991171 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -460,7 +460,7 @@ def forward( cache_position: torch.LongTensor | None = None, logits_to_keep: int | torch.Tensor = 0, **kwargs: Unpack[TransformersKwargs], - ) -> tuple | Qwen3VLMoeCausalLMOutputWithPast: + ) -> Qwen3VLMoeCausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
            Indices should either be in `[0, ...,
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index 182d4b2c054a..abd8f0bf463d 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -1165,7 +1165,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1490,7 +1490,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1715,7 +1715,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py
index f9289f9e6619..7d9cffa1b6b2 100644
--- a/src/transformers/models/rt_detr/modular_rt_detr.py
+++ b/src/transformers/models/rt_detr/modular_rt_detr.py
@@ -1274,7 +1274,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrDecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1476,7 +1476,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrModelOutput:
+    ) -> RTDetrModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1701,7 +1701,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrObjectDetectionOutput:
+    ) -> RTDetrObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
index b5244ffda7f8..13fd3c87dbf1 100644
--- a/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
+++ b/src/transformers/models/rt_detr_v2/modeling_rt_detr_v2.py
@@ -588,7 +588,7 @@ def forward(
         spatial_shapes_list=None,
         level_start_index=None,
         **kwargs: Unpack[TransformersKwargs],
-    ):
+    ) -> RTDetrV2DecoderOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
@@ -1413,7 +1413,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ModelOutput:
+    ) -> RTDetrV2ModelOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
@@ -1747,7 +1747,7 @@ def forward(
         inputs_embeds: torch.FloatTensor | None = None,
         labels: list[dict] | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.FloatTensor] | RTDetrV2ObjectDetectionOutput:
+    ) -> RTDetrV2ObjectDetectionOutput:
         r"""
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index 3825a8bfb99e..b8908e9035c7 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -297,7 +297,7 @@ def forward(
         output_hidden_states: bool | None = False,
         return_dict: bool | None = True,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor] | BaseModelOutput:
+    ) -> BaseModelOutput:
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
diff --git a/src/transformers/models/t5gemma2/modeling_t5gemma2.py b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
index 54a6bef97262..3c255decd994 100644
--- a/src/transformers/models/t5gemma2/modeling_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modeling_t5gemma2.py
@@ -1256,7 +1256,7 @@ def get_decoder(self):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
diff --git a/src/transformers/models/t5gemma2/modular_t5gemma2.py b/src/transformers/models/t5gemma2/modular_t5gemma2.py
index 1523a164a598..68b84b524288 100644
--- a/src/transformers/models/t5gemma2/modular_t5gemma2.py
+++ b/src/transformers/models/t5gemma2/modular_t5gemma2.py
@@ -1274,7 +1274,7 @@ def get_decoder(self):
     @auto_docstring
     def get_image_features(
         self, pixel_values: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         return self.get_encoder().get_image_features(pixel_values, **kwargs)

     @property
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
index a10fffa4c029..a957736be547 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modeling_vibevoice_acoustic_tokenizer.py
@@ -529,7 +529,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -569,7 +571,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -604,7 +606,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
index 31ee1a2919bd..a7bab1326fb0 100644
--- a/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
+++ b/src/transformers/models/vibevoice_acoustic_tokenizer/modular_vibevoice_acoustic_tokenizer.py
@@ -417,7 +417,9 @@ def __init__(self, config):

     @can_return_tuple
     @auto_docstring
-    def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):
+    def encode(
+        self, input_values, padding_cache=None, use_cache=None, sample=True
+    ) -> VibeVoiceAcousticTokenizerEncoderOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
@@ -457,7 +459,7 @@ def encode(self, input_values, padding_cache=None, use_cache=None, sample=True):

     @can_return_tuple
     @auto_docstring
-    def decode(self, latents, padding_cache=None, use_cache=False):
+    def decode(self, latents, padding_cache=None, use_cache=False) -> VibeVoiceAcousticTokenizerDecoderOutput:
         r"""
         latents (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input latent representation to be decoded back into audio.
@@ -492,7 +494,9 @@ def decode(self, latents, padding_cache=None, use_cache=False):

     @can_return_tuple
     @auto_docstring
-    def forward(self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs):
+    def forward(
+        self, input_values, padding_cache=None, use_cache=False, sample=True, **kwargs
+    ) -> VibeVoiceAcousticTokenizerOutput:
         r"""
         input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`):
             Input audio waveform to be encoded into latent representation.
diff --git a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
index 6212b61bd2a7..e5b9d2228df0 100644
--- a/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modeling_voxtral_realtime.py
@@ -544,7 +544,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -1000,7 +1000,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
index 8dc007c8daaa..3c5f2a867911 100644
--- a/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/modular_voxtral_realtime.py
@@ -374,7 +374,7 @@ def forward(
         use_padding_cache: bool | None = None,
         attention_mask: torch.Tensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         padding_cache (`VoxtralRealtimeConv1dPaddingCache`, *optional*):
             Cache for padding in convolutional layers to maintain state across streaming chunks.
@@ -610,7 +610,7 @@ def get_audio_features(
         past_key_values: Cache | None = None,
         use_cache: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         r"""
         input_features (`torch.FloatTensor`):
             Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py
index ca545f97bd6a..110bd10c5d63 100644
--- a/src/transformers/models/x_clip/modeling_x_clip.py
+++ b/src/transformers/models/x_clip/modeling_x_clip.py
@@ -565,7 +565,7 @@ def forward(
         output_hidden_states: bool | None = None,
         return_dict: bool | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutput:
+    ) -> BaseModelOutput:
         r"""
         Args:
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):

From b05bc5d01e20e2fcf2b164888b611782eef1389b Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Fri, 20 Feb 2026 11:02:26 +0100
Subject: [PATCH 20/20] Update incorrect typings on modular classes that inherit decorators

---
 src/transformers/models/mlcd/modular_mlcd.py                 | 2 +-
 src/transformers/models/qwen3_5/modular_qwen3_5.py           | 4 ++--
 src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/mlcd/modular_mlcd.py b/src/transformers/models/mlcd/modular_mlcd.py
index 443acec5a513..4e2442b4e024 100644
--- a/src/transformers/models/mlcd/modular_mlcd.py
+++ b/src/transformers/models/mlcd/modular_mlcd.py
@@ -399,7 +399,7 @@ def forward(
         self,
         pixel_values: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         if pixel_values is None:
             raise ValueError("You have to specify pixel_values")

diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py
index e34cee3d41a3..2f80eae3b8b7 100644
--- a/src/transformers/models/qwen3_5/modular_qwen3_5.py
+++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py
@@ -697,7 +697,7 @@ class Qwen3_5Model(Qwen3VLModel):
     def get_video_features(
         self,
         **super_kwargs,
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         # Same implementation as for images
         return super().get_video_features(**super_kwargs)

@@ -706,7 +706,7 @@ def get_image_features(
         pixel_values: torch.FloatTensor,
         image_grid_thw: torch.LongTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple | BaseModelOutputWithPooling:
+    ) -> BaseModelOutputWithPooling:
         pixel_values = pixel_values.type(self.visual.dtype)
         vision_output: BaseModelOutputWithPooling = self.visual(
             pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs
diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
index 40fe39991171..24c86fdbe27c 100644
--- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
+++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
@@ -354,7 +354,7 @@ def forward(
         visual_pos_masks: torch.Tensor | None = None,
         deepstack_visual_embeds: list[torch.Tensor] | None = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple | MoeModelOutputWithPast:
+    ) -> MoeModelOutputWithPast:
         r"""
         visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
             The mask of the visual positions.