diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py
index 457ccc9001bf..0f19b8dcab16 100755
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -712,6 +712,7 @@ def compute_metrics(p: EvalPrediction):
         else:
             predictions = np.argmax(predictions, axis=1)
         output_predict_file = os.path.join(training_args.output_dir, "predict_results.txt")
+        id2label = model.config.id2label
         if trainer.is_world_process_zero():
             with open(output_predict_file, "w") as writer:
                 logger.info("***** Predict results *****")
@@ -721,10 +722,10 @@ def compute_metrics(p: EvalPrediction):
                         writer.write(f"{index}\t{item:3.3f}\n")
                     elif is_multi_label:
                         # recover from multi-hot encoding
-                        item = [label_list[i] for i in range(len(item)) if item[i] == 1]
+                        item = [id2label[i] for i in range(len(item)) if item[i] == 1]
                         writer.write(f"{index}\t{item}\n")
                     else:
-                        item = label_list[item]
+                        item = id2label[item]
                         writer.write(f"{index}\t{item}\n")
         logger.info(f"Predict results saved at {output_predict_file}")
     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 77e4193e7a3c..10458893fa94 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -431,10 +431,10 @@ def main():

     if label_to_id is not None:
         model.config.label2id = label_to_id
-        model.config.id2label = {id: label for label, id in config.label2id.items()}
+        model.config.id2label = {id: label for label, id in model.config.label2id.items()}
     elif data_args.task_name is not None and not is_regression:
         model.config.label2id = {l: i for i, l in enumerate(label_list)}
-        model.config.id2label = {id: label for label, id in config.label2id.items()}
+        model.config.id2label = {id: label for label, id in model.config.label2id.items()}

     if data_args.max_seq_length > tokenizer.model_max_length:
         logger.warning(
@@ -604,6 +604,7 @@ def compute_metrics(p: EvalPrediction):
             tasks.append("mnli-mm")
             predict_datasets.append(raw_datasets["test_mismatched"])

+        id2label = model.config.id2label
         for predict_dataset, task in zip(predict_datasets, tasks):
             # Removing the `label` columns because it contains -1 and Trainer won't like that.
             predict_dataset = predict_dataset.remove_columns("label")
@@ -619,7 +620,7 @@ def compute_metrics(p: EvalPrediction):
                         if is_regression:
                             writer.write(f"{index}\t{item:3.3f}\n")
                         else:
-                            item = label_list[item]
+                            item = id2label[item]
                             writer.write(f"{index}\t{item}\n")

     kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py
index c89618f2d9cb..9f02d5146326 100644
--- a/src/transformers/audio_utils.py
+++ b/src/transformers/audio_utils.py
@@ -88,6 +88,12 @@ def load_audio(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np
         # needed. Do not raise any errors if not installed or versions do not match
         if is_torchcodec_available() and version.parse("0.3.0") <= TORCHCODEC_VERSION:
             audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate, timeout=timeout)
+        elif audio.rsplit("?", 1)[0].lower().endswith((".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")):
+            raise RuntimeError(
+                f"The audio source appears to be a video file ('{audio.split('/')[-1]}'). "
+                "librosa cannot decode video containers. "
+                "Install torchcodec>=0.3.0 (`pip install torchcodec`) to load audio from video files."
+            )
         else:
             audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout)
     elif not isinstance(audio, np.ndarray):
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index 95a47ae39fdf..673c8ae1e069 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -556,6 +556,61 @@ def get_seq_length(self) -> int:
         """Returns the sequence length of the cached states."""
         return self.cumulative_length

+    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
+        """Reorders both the residual and quantized buffers for beam search."""
+        super().reorder_cache(beam_idx)
+        if hasattr(self, "_quantized_keys"):
+            dequant_keys = self._dequantize(self._quantized_keys)
+            dequant_values = self._dequantize(self._quantized_values)
+            dequant_keys = dequant_keys.index_select(0, beam_idx.to(dequant_keys.device))
+            dequant_values = dequant_values.index_select(0, beam_idx.to(dequant_values.device))
+            self._quantized_keys = self._quantize(dequant_keys.contiguous(), axis=self.axis_key)
+            self._quantized_values = self._quantize(dequant_values.contiguous(), axis=self.axis_value)
+
+    def crop(self, max_length: int) -> None:
+        """Crop the residual buffer; re-quantize the whole state if the crop falls inside the quantized region."""
+        if max_length < 0:
+            max_length = self.get_seq_length() - abs(max_length)
+
+        if self.get_seq_length() <= max_length:
+            return
+
+        if not hasattr(self, "_quantized_keys"):
+            super().crop(max_length)
+            self.cumulative_length = max_length
+            return
+
+        # Reconstruct the full-precision tensor, crop, and re-quantize
+        dequant_keys = self._dequantize(self._quantized_keys)
+        dequant_values = self._dequantize(self._quantized_values)
+        full_keys = torch.cat([dequant_keys, self.keys], dim=-2) if self.keys.numel() > 0 else dequant_keys
+        full_values = torch.cat([dequant_values, self.values], dim=-2) if self.values.numel() > 0 else dequant_values
+        full_keys = full_keys[..., :max_length, :]
+        full_values = full_values[..., :max_length, :]
+        self._quantized_keys = self._quantize(full_keys.contiguous(), axis=self.axis_key)
+        self._quantized_values = self._quantize(full_values.contiguous(), axis=self.axis_value)
+        self.keys = torch.tensor([], dtype=self.keys.dtype, device=self.keys.device)
+        self.values = torch.tensor([], dtype=self.values.dtype, device=self.values.device)
+        self.cumulative_length = max_length
+
+    def batch_repeat_interleave(self, repeats: int) -> None:
+        """Repeat both the residual and quantized buffers in the batch dimension."""
+        super().batch_repeat_interleave(repeats)
+        if hasattr(self, "_quantized_keys"):
+            dequant_keys = self._dequantize(self._quantized_keys).repeat_interleave(repeats, dim=0)
+            dequant_values = self._dequantize(self._quantized_values).repeat_interleave(repeats, dim=0)
+            self._quantized_keys = self._quantize(dequant_keys.contiguous(), axis=self.axis_key)
+            self._quantized_values = self._quantize(dequant_values.contiguous(), axis=self.axis_value)
+
+    def batch_select_indices(self, indices: torch.Tensor) -> None:
+        """Select batch indices from both the residual and quantized buffers."""
+        super().batch_select_indices(indices)
+        if hasattr(self, "_quantized_keys"):
+            dequant_keys = self._dequantize(self._quantized_keys)[indices, ...]
+            dequant_values = self._dequantize(self._quantized_values)[indices, ...]
+ self._quantized_keys = self._quantize(dequant_keys.contiguous(), axis=self.axis_key) + self._quantized_values = self._quantize(dequant_values.contiguous(), axis=self.axis_value) + class QuantoQuantizedLayer(QuantizedLayer): def __init__( diff --git a/src/transformers/cli/serve.py b/src/transformers/cli/serve.py index 3d7c6a0c51ba..77fd7b134e01 100644 --- a/src/transformers/cli/serve.py +++ b/src/transformers/cli/serve.py @@ -150,6 +150,7 @@ def __init__( completion_handler=self._completion_handler, response_handler=self._response_handler, transcription_handler=self._transcription_handler, + generation_state=self._generation_state, enable_cors=enable_cors, ) diff --git a/src/transformers/cli/serving/chat_completion.py b/src/transformers/cli/serving/chat_completion.py index 161a25a02f41..c25ba58f7e52 100644 --- a/src/transformers/cli/serving/chat_completion.py +++ b/src/transformers/cli/serving/chat_completion.py @@ -23,10 +23,11 @@ from typing import TYPE_CHECKING from ...utils import logging -from ...utils.import_utils import is_serve_available +from .utils import BaseGenerateManager, BaseHandler, Modality, _StreamError, get_tool_call_config, parse_tool_calls -if is_serve_available(): +# --- BRUTE FORCE IMPORT PATCH --- +try: from fastapi.responses import JSONResponse, StreamingResponse from openai.types.chat import ChatCompletion, ChatCompletionMessage, ChatCompletionMessageToolCall from openai.types.chat.chat_completion import Choice @@ -35,26 +36,62 @@ from openai.types.chat.completion_create_params import CompletionCreateParamsStreaming from openai.types.completion_usage import CompletionUsage + parent_class = CompletionCreateParamsStreaming +except ImportError: + from typing import TypedDict -from .utils import ( - BaseGenerateManager, - BaseHandler, - Modality, - _StreamError, - get_tool_call_config, - parse_tool_calls, -) + class _DummyDict(dict): + def __getattr__(self, name): + return None + def __setattr__(self, name, value): + self[name] = value -if TYPE_CHECKING: - from transformers import GenerationConfig, PreTrainedModel, PreTrainedTokenizerFast, ProcessorMixin + class ChatCompletion(_DummyDict): + pass + + class ChatCompletionMessage(_DummyDict): + pass + + class ChatCompletionMessageToolCall(_DummyDict): + pass + + class Choice(_DummyDict): + pass + + class ChatCompletionChunk(_DummyDict): + pass + + class ChoiceDelta(_DummyDict): + pass + + class ChoiceDeltaToolCall(_DummyDict): + pass + + class ChoiceChunk(_DummyDict): + pass + + class CompletionCreateParamsStreaming(_DummyDict): + pass + class CompletionUsage(_DummyDict): + pass -class TransformersCompletionCreateParamsStreaming(CompletionCreateParamsStreaming, total=False): + parent_class = TypedDict + + +class TransformersCompletionCreateParamsStreaming(parent_class, total=False): # type: ignore generation_config: str seed: int +# --- END PATCH --- + + +if TYPE_CHECKING: + from transformers import GenerationConfig, PreTrainedModel, PreTrainedTokenizerFast, ProcessorMixin + + # Fields accepted by the OpenAI schema but not yet supported. # Receiving these raises an error to avoid silent misbehaviour. # NOTE: "stop" is NOT in this set — we map it to stop_strings. 
@@ -133,7 +170,7 @@ async def handle_request(self, body: dict, request_id: str) -> StreamingResponse **chat_template_kwargs, ) if not use_cb: - inputs = inputs.to(model.device) # type: ignore[union-attr] + inputs = inputs.to(model.device) # type: ignore gen_config = self._build_generation_config(body, model.generation_config, use_cb=use_cb) # TODO: remove when CB supports per-request generation config @@ -237,7 +274,10 @@ async def sse_gen() -> AsyncGenerator[str, None]: index=i, type="function", id=f"{request_id}_tool_call_{i}", - function={"name": tc["name"], "arguments": tc["arguments"]}, + function={ + "name": tc["name"], + "arguments": tc["arguments"], + }, ) ], ) @@ -328,7 +368,12 @@ async def _non_streaming( # ----- helpers ----- - def _build_generation_config(self, body: dict, model_generation_config: "GenerationConfig", use_cb: bool = False): + def _build_generation_config( + self, + body: dict, + model_generation_config: "GenerationConfig", + use_cb: bool = False, + ): """Apply Chat Completions params (``max_tokens``, ``frequency_penalty``, ``logit_bias``, ``stop``) on top of the base generation config.""" generation_config = super()._build_generation_config(body, model_generation_config, use_cb=use_cb) diff --git a/src/transformers/cli/serving/completion.py b/src/transformers/cli/serving/completion.py index 52c1f1b8471d..ed04fffb12a8 100644 --- a/src/transformers/cli/serving/completion.py +++ b/src/transformers/cli/serving/completion.py @@ -22,7 +22,7 @@ import asyncio import time from collections.abc import AsyncGenerator -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypedDict from ...utils import logging from ...utils.import_utils import is_serve_available @@ -34,7 +34,6 @@ from openai.types import Completion, CompletionChoice, CompletionUsage from openai.types.completion_create_params import CompletionCreateParamsBase - from .utils import BaseGenerateManager, BaseHandler, _StreamError @@ -42,11 +41,21 @@ from transformers import GenerationConfig, PreTrainedModel, PreTrainedTokenizerFast, ProcessorMixin -class TransformersTextCompletionCreateParams(CompletionCreateParamsBase, total=False): - generation_config: str - seed: int - stream: bool +# --- FINAL ROBUST PATCH --- +if "CompletionCreateParamsBase" in globals(): + # If the real OpenAI class was successfully imported, use it + class TransformersTextCompletionCreateParams(CompletionCreateParamsBase, total=False): + generation_config: str + seed: int + +else: + # Fallback to standard TypedDict if OpenAI types are missing + class TransformersTextCompletionCreateParams(TypedDict, total=False): + generation_config: str + seed: int + +# --- END PATCH --- # Fields accepted by the OpenAI schema but not yet supported. 
UNUSED_LEGACY_COMPLETION_FIELDS = { @@ -109,10 +118,26 @@ async def handle_request(self, body: dict, request_id: str) -> "StreamingRespons streaming = body.get("stream") if streaming: - return self._streaming(request_id, model, processor, model_id, inputs, gen_config, gen_manager, suffix) + return self._streaming( + request_id, + model, + processor, + model_id, + inputs, + gen_config, + gen_manager, + suffix, + ) else: return await self._non_streaming( - request_id, model, processor, model_id, inputs, gen_config, gen_manager, suffix + request_id, + model, + processor, + model_id, + inputs, + gen_config, + gen_manager, + suffix, ) # ----- streaming ----- @@ -261,7 +286,12 @@ def _build_chunk_sse( # ----- generation config ----- - def _build_generation_config(self, body: dict, model_generation_config: "GenerationConfig", use_cb: bool = False): + def _build_generation_config( + self, + body: dict, + model_generation_config: "GenerationConfig", + use_cb: bool = False, + ): """Apply legacy completion params (``max_tokens``, ``frequency_penalty``, ``stop``) on top of base config.""" generation_config = super()._build_generation_config(body, model_generation_config, use_cb=use_cb) diff --git a/src/transformers/cli/serving/model_manager.py b/src/transformers/cli/serving/model_manager.py index 826199ee4b01..d718b99738b1 100644 --- a/src/transformers/cli/serving/model_manager.py +++ b/src/transformers/cli/serving/model_manager.py @@ -159,11 +159,20 @@ def _resolve_dtype(dtype: str | None): return resolved def _validate_args(self): - if self.quantization is not None and self.quantization not in ("bnb-4bit", "bnb-8bit"): + if self.quantization is not None and self.quantization not in ( + "bnb-4bit", + "bnb-8bit", + ): raise ValueError( f"Unsupported quantization method: '{self.quantization}'. Must be 'bnb-4bit' or 'bnb-8bit'." ) - VALID_ATTN_IMPLEMENTATIONS = {"eager", "sdpa", "flash_attention_2", "flash_attention_3", "flex_attention"} + VALID_ATTN_IMPLEMENTATIONS = { + "eager", + "sdpa", + "flash_attention_2", + "flash_attention_3", + "flex_attention", + } is_kernels_community = self.attn_implementation is not None and self.attn_implementation.startswith( "kernels-community/" ) @@ -208,7 +217,10 @@ def _load_processor(self, model_id_and_revision: str) -> "ProcessorMixin | PreTr return AutoProcessor.from_pretrained(model_id, revision=revision, trust_remote_code=self.trust_remote_code) def _load_model( - self, model_id_and_revision: str, tqdm_class: type | None = None, progress_callback: Callable | None = None + self, + model_id_and_revision: str, + tqdm_class: type | None = None, + progress_callback: Callable | None = None, ) -> "PreTrainedModel": """Load a model. 
@@ -270,10 +282,18 @@ def load_model_and_processor( if model_id_and_revision not in self.loaded_models: logger.warning(f"Loading {model_id_and_revision}") if progress_callback is not None: - progress_callback({"status": "loading", "model": model_id_and_revision, "stage": "processor"}) + progress_callback( + { + "status": "loading", + "model": model_id_and_revision, + "stage": "processor", + } + ) processor = self._load_processor(model_id_and_revision) model = self._load_model( - model_id_and_revision, tqdm_class=tqdm_class, progress_callback=progress_callback + model_id_and_revision, + tqdm_class=tqdm_class, + progress_callback=progress_callback, ) self.loaded_models[model_id_and_revision] = TimedModel( model, @@ -282,13 +302,25 @@ def load_model_and_processor( on_unload=lambda key=model_id_and_revision: self.loaded_models.pop(key, None), ) if progress_callback is not None: - progress_callback({"status": "ready", "model": model_id_and_revision, "cached": False}) + progress_callback( + { + "status": "ready", + "model": model_id_and_revision, + "cached": False, + } + ) else: self.loaded_models[model_id_and_revision].reset_timer() model = self.loaded_models[model_id_and_revision].model processor = self.loaded_models[model_id_and_revision].processor if progress_callback is not None: - progress_callback({"status": "ready", "model": model_id_and_revision, "cached": True}) + progress_callback( + { + "status": "ready", + "model": model_id_and_revision, + "cached": True, + } + ) return model, processor async def load_model_streaming(self, model_id_and_revision: str): @@ -384,7 +416,8 @@ def shutdown(self) -> None: @staticmethod def get_model_modality( - model: "PreTrainedModel", processor: "ProcessorMixin | PreTrainedTokenizerFast | None" = None + model: "PreTrainedModel", + processor: "ProcessorMixin | PreTrainedTokenizerFast | None" = None, ) -> Modality: """Detect whether a model is an LLM or VLM based on its architecture. 
@@ -441,7 +474,10 @@ def get_gen_models(cache_dir: str | None = None) -> list[dict]: continue for ref, revision_info in repo.refs.items(): - config_path = next((f.file_path for f in revision_info.files if f.file_name == "config.json"), None) + config_path = next( + (f.file_path for f in revision_info.files if f.file_name == "config.json"), + None, + ) if not config_path: continue diff --git a/src/transformers/cli/serving/response.py b/src/transformers/cli/serving/response.py index 4d29dfd1d6a2..f8e2491b5e34 100644 --- a/src/transformers/cli/serving/response.py +++ b/src/transformers/cli/serving/response.py @@ -20,7 +20,7 @@ import asyncio import time from collections.abc import AsyncGenerator -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypedDict from ...utils import logging from ...utils.import_utils import is_serve_available @@ -48,18 +48,16 @@ ResponseTextDeltaEvent, ResponseTextDoneEvent, ) - from openai.types.responses.response_create_params import ResponseCreateParamsStreaming - from openai.types.responses.response_usage import InputTokensDetails, OutputTokensDetails, ResponseUsage - + from openai.types.responses.response_create_params import ( + ResponseCreateParamsStreaming, + ) + from openai.types.responses.response_usage import ( + InputTokensDetails, + OutputTokensDetails, + ResponseUsage, + ) -from .utils import ( - BaseGenerateManager, - BaseHandler, - Modality, - _StreamError, - get_tool_call_config, - parse_tool_calls, -) +from .utils import BaseGenerateManager, BaseHandler, Modality, _StreamError, get_tool_call_config, parse_tool_calls if TYPE_CHECKING: @@ -69,10 +67,21 @@ logger = logging.get_logger(__name__) -class TransformersResponseCreateParamsStreaming(ResponseCreateParamsStreaming, total=False): - generation_config: str - seed: int +# --- FINAL ROBUST PATCH --- +if "ResponseCreateParamsStreaming" in globals(): + + class TransformersResponseCreateParamsStreaming(ResponseCreateParamsStreaming, total=False): + generation_config: str + seed: int +else: + + class TransformersResponseCreateParamsStreaming(TypedDict, total=False): + generation_config: str + seed: int + + +# --- END PATCH --- UNUSED_RESPONSE_FIELDS = { "background", @@ -192,7 +201,14 @@ def _normalize_tools(tools: list[dict] | None) -> list[dict] | None: if not tools: return tools return [ - {"type": "function", "function": {k: v for k, v in t.items() if k != "type"}} if "function" not in t else t + ( + { + "type": "function", + "function": {k: v for k, v in t.items() if k != "type"}, + } + if "function" not in t + else t + ) for t in tools ] @@ -278,7 +294,10 @@ def _normalize_response_items(items: list[dict]) -> list[dict]: ) else: - raise HTTPException(status_code=422, detail=f"Unsupported input item type: {item_type!r}") + raise HTTPException( + status_code=422, + detail=f"Unsupported input item type: {item_type!r}", + ) return messages @@ -402,7 +421,11 @@ async def event_stream() -> AsyncGenerator[str, None]: logger.error(f"Exception in response generation: {text.msg}") sse_parts.append( self.chunk_to_sse( - ResponseErrorEvent(type="error", sequence_number=seq, message=text.msg) + ResponseErrorEvent( + type="error", + sequence_number=seq, + message=text.msg, + ) ) ) seq += 1 @@ -540,7 +563,12 @@ async def event_stream() -> AsyncGenerator[str, None]: ResponseCompletedEvent( type="response.completed", sequence_number=seq, - response=Response(**response_base, status="completed", output=all_output, usage=usage), + response=Response( + **response_base, + status="completed", + 
output=all_output, + usage=usage, + ), ) ) seq += 1 @@ -616,7 +644,12 @@ async def _non_streaming( # ----- helpers ----- - def _build_generation_config(self, body: dict, model_generation_config: "GenerationConfig", use_cb: bool = False): + def _build_generation_config( + self, + body: dict, + model_generation_config: "GenerationConfig", + use_cb: bool = False, + ): """Apply Responses API params (``max_output_tokens``) on top of the base generation config.""" generation_config = super()._build_generation_config(body, model_generation_config, use_cb=use_cb) diff --git a/src/transformers/cli/serving/server.py b/src/transformers/cli/serving/server.py index 13a9565db590..2a013acf0619 100644 --- a/src/transformers/cli/serving/server.py +++ b/src/transformers/cli/serving/server.py @@ -32,7 +32,7 @@ from .model_manager import ModelManager from .response import ResponseHandler from .transcription import TranscriptionHandler -from .utils import X_REQUEST_ID +from .utils import X_REQUEST_ID, CBWorkerDeadError, GenerationState logger = logging.get_logger(__name__) @@ -44,6 +44,7 @@ def build_server( completion_handler: CompletionHandler, response_handler: ResponseHandler, transcription_handler: TranscriptionHandler, + generation_state: GenerationState, enable_cors: bool = False, ) -> FastAPI: """Build and return a configured FastAPI application. @@ -52,6 +53,7 @@ def build_server( model_manager: Handles model loading, caching, and cleanup. chat_handler: Handles `/v1/chat/completions` requests. response_handler: Handles `/v1/responses` requests. + generation_state: Shared generation state, used by `/health` to report CB liveness. enable_cors: If `True`, adds permissive CORS middleware (allow all origins). Returns: @@ -65,6 +67,12 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) + @app.exception_handler(CBWorkerDeadError) + async def _cb_dead_handler(_request: Request, exc: CBWorkerDeadError): + # CB worker died (e.g. CUDA illegal memory access); reject new requests with 503 + # carrying the cause, instead of letting them hang in the input queue forever. 
+ return JSONResponse({"error": str(exc)}, status_code=503) + if enable_cors: app.add_middleware( CORSMiddleware, @@ -113,7 +121,8 @@ async def load_model(body: dict): raise HTTPException(status_code=422, detail="Missing `model` field in the request body.") model_id_and_revision = model_manager.process_model_name(model) return StreamingResponse( - model_manager.load_model_streaming(model_id_and_revision), media_type="text/event-stream" + model_manager.load_model_streaming(model_id_and_revision), + media_type="text/event-stream", ) @app.post("/reset") @@ -128,6 +137,8 @@ def list_models(): @app.get("/health") def health(): + if not generation_state.is_cb_alive(): + return JSONResponse({"status": "unhealthy", "reason": "cb_worker_dead"}, status_code=503) return JSONResponse({"status": "ok"}) return app diff --git a/src/transformers/cli/serving/transcription.py b/src/transformers/cli/serving/transcription.py index 5865dc77029f..fc853a1eb46b 100644 --- a/src/transformers/cli/serving/transcription.py +++ b/src/transformers/cli/serving/transcription.py @@ -16,7 +16,7 @@ """ import io -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypedDict from ...utils import logging from ...utils.import_utils import is_serve_available @@ -25,7 +25,9 @@ if is_serve_available(): from fastapi import HTTPException, Request from fastapi.responses import JSONResponse, StreamingResponse - from openai.types.audio.transcription_create_params import TranscriptionCreateParamsBase + from openai.types.audio.transcription_create_params import ( + TranscriptionCreateParamsBase, + ) from .model_manager import ModelManager from .utils import DirectStreamer, GenerateManager, GenerationState, _StreamError @@ -38,8 +40,21 @@ logger = logging.get_logger(__name__) -class TransformersTranscriptionCreateParams(TranscriptionCreateParamsBase, total=False): - stream: bool +# --- FINAL ROBUST PATCH --- +if "TranscriptionCreateParamsBase" in globals(): + + class TransformersTranscriptionCreateParams(TranscriptionCreateParamsBase, total=False): + generation_config: str + seed: int + +else: + + class TransformersTranscriptionCreateParams(TypedDict, total=False): + generation_config: str + seed: int + + +# --- END PATCH --- UNUSED_TRANSCRIPTION_FIELDS = { @@ -77,7 +92,10 @@ def _validate_request(self, form_keys: set[str]) -> None: """Validate transcription request fields.""" unexpected = form_keys - getattr(TransformersTranscriptionCreateParams, "__mutable_keys__", set()) if unexpected: - raise HTTPException(status_code=422, detail=f"Unexpected fields in the request: {unexpected}") + raise HTTPException( + status_code=422, + detail=f"Unexpected fields in the request: {unexpected}", + ) unused = form_keys & UNUSED_TRANSCRIPTION_FIELDS if unused: logger.warning_once(f"Ignoring unsupported fields in the request: {unused}") @@ -116,7 +134,10 @@ async def handle_request(self, request: Request) -> JSONResponse | StreamingResp audio_model, audio_processor = self.model_manager.load_model_and_processor(model_id_and_revision) base_manager = self.generation_state.get_manager(model_id_and_revision) if not isinstance(base_manager, GenerateManager): - raise HTTPException(status_code=400, detail="Audio transcription requires sequential generation (not CB)") + raise HTTPException( + status_code=400, + detail="Audio transcription requires sequential generation (not CB)", + ) gen_manager = base_manager audio_inputs = self._prepare_audio_inputs(file_bytes, audio_processor, audio_model) @@ -126,7 +147,9 @@ async def handle_request(self, 
request: Request) -> JSONResponse | StreamingResp @staticmethod def _prepare_audio_inputs( - file_bytes: bytes, audio_processor: "ProcessorMixin", audio_model: "PreTrainedModel" + file_bytes: bytes, + audio_processor: "ProcessorMixin", + audio_model: "PreTrainedModel", ) -> dict: """Load audio bytes and convert to model inputs.""" import librosa diff --git a/src/transformers/cli/serving/utils.py b/src/transformers/cli/serving/utils.py index d786a828fc28..e4f2d3714322 100644 --- a/src/transformers/cli/serving/utils.py +++ b/src/transformers/cli/serving/utils.py @@ -73,6 +73,14 @@ class _GenerationCancelled(Exception): """Raised inside ``DirectStreamer.put()`` to abort ``model.generate()``.""" +class CBWorkerDeadError(RuntimeError): + """Raised when a request is submitted to a CB worker that has died. + + Surfaced as 503 by the FastAPI exception handler. Carries the original error message + that killed the worker so the client knows why the server is in this state. + """ + + # Fallback tool call configs for models that don't declare stc_token/etc_token/response_schema # on their tokenizer. # Keys are matched via substring against model_type (e.g. "qwen" matches "qwen2", "qwen3_vl", etc.). @@ -108,7 +116,10 @@ def get_tool_call_config(processor, model: "PreTrainedModel") -> dict | None: schema = response_schema["properties"]["tool_calls"] else: # Fallback: known model families without full tokenizer config - fallback = next((v for k, v in _TOOL_CALL_FALLBACKS.items() if k in model.config.model_type), None) + fallback = next( + (v for k, v in _TOOL_CALL_FALLBACKS.items() if k in model.config.model_type), + None, + ) if fallback is None: return None stc, etc, schema = fallback["stc"], fallback["etc"], fallback["schema"] @@ -131,7 +142,7 @@ def _normalize_tool_call(tool_call: dict) -> dict: arguments = function.get("arguments", {}) return { "name": function["name"], - "arguments": json.dumps(arguments) if not isinstance(arguments, str) else arguments, + "arguments": (json.dumps(arguments) if not isinstance(arguments, str) else arguments), } @@ -153,7 +164,7 @@ def parse_tool_calls(processor, generated_ids, schema: dict) -> list[dict] | Non if not isinstance(parsed, list): parsed = [parsed] tool_calls = [_normalize_tool_call(tool_call) for tool_call in parsed] - return tool_calls if tool_calls else None + return tool_calls or None class DownloadAggregator: @@ -552,7 +563,12 @@ def generate_streaming( # ProcessorMixin exposes the fast tokenizer as .tokenizer; PreTrainedTokenizerFast is already one. rust_tokenizer = getattr(processor, "tokenizer", processor)._tokenizer # type: ignore[union-attr] streamer = DirectStreamer(rust_tokenizer, loop, queue, tool_config=tool_config) - gen_kwargs = {**inputs, "streamer": streamer, "generation_config": gen_config, "tokenizer": processor} + gen_kwargs = { + **inputs, + "streamer": streamer, + "generation_config": gen_config, + "tokenizer": processor, + } if hasattr(model, "has_talker"): gen_kwargs["generation_mode"] = "text" @@ -578,7 +594,11 @@ async def generate_non_streaming( """Run generation to completion via ``model.generate()`` on the inference thread.""" # Multimodal models (e.g. 
Qwen2.5-Omni) may generate audio alongside text by default; # force text-only output since the serve layer only handles text - generate_kwargs = {**inputs, "generation_config": gen_config, "tokenizer": processor} + generate_kwargs = { + **inputs, + "generation_config": gen_config, + "tokenizer": processor, + } if hasattr(model, "has_talker"): generate_kwargs["generation_mode"] = "text" sequences = await self.async_submit(model.generate, **generate_kwargs) @@ -635,6 +655,21 @@ def init_cb(self, model: "PreTrainedModel", gen_config: "GenerationConfig") -> N ) self._cb.start() + def is_alive(self) -> bool: + """Whether the CB worker is healthy and able to serve new requests.""" + return self._cb is not None and self._cb.fatal_error is None + + def _check_alive(self, request_id: str) -> None: + """Raise :class:`CBWorkerDeadError` if the CB worker has died. + + Called at request entry to fail fast — submitting to a dead worker would otherwise + enqueue the request into a void where it never gets processed. + """ + if self._cb is not None and self._cb.fatal_error is not None: + raise CBWorkerDeadError( + f"CB worker is dead and cannot accept request {request_id}: {self._cb.fatal_error}" + ) + def generate_streaming( self, model: "PreTrainedModel", @@ -648,6 +683,7 @@ def generate_streaming( cb = self._cb if cb is None: raise RuntimeError("CB manager not initialized. Call `init_cb()` first.") + self._check_alive(request_id) loop = asyncio.get_running_loop() text_queue: asyncio.Queue = asyncio.Queue() @@ -662,14 +698,27 @@ def generate_streaming( ) # ProcessorMixin exposes the fast tokenizer as .tokenizer; PreTrainedTokenizerFast is already one. rust_tokenizer = getattr(processor, "tokenizer", processor)._tokenizer # type: ignore[union-attr] - streamer = CBStreamer(self._cb, request_id, rust_tokenizer, loop, text_queue, tool_config=tool_config) + streamer = CBStreamer( + self._cb, + request_id, + rust_tokenizer, + loop, + text_queue, + tool_config=tool_config, + ) # Register a direct callback: the dispatcher calls this on the event loop with each GenerationOutput. # This decodes tokens and pushes text straight to the SSE text_queue def _on_output(output): try: streamer.put(output) - if output.is_finished(): + # ``error`` is set together with ``status = FAILED`` in CB's _handle_request_error. + # Surface it as an end-of-stream error so the SSE handler can emit it and close, + # instead of leaving the client hanging on a stream that will never end. + if output.error is not None: + text_queue.put_nowait(_StreamError(output.error)) + streamer.end() + elif output.is_finished(): streamer.end() except Exception as e: text_queue.put_nowait(_StreamError(str(e))) @@ -689,6 +738,7 @@ async def generate_non_streaming( cb = self._cb if cb is None: raise RuntimeError("CB manager not initialized. Call `init_cb()` first.") + self._check_alive(request_id) input_ids = inputs["input_ids"] input_len = len(input_ids) @@ -711,8 +761,16 @@ def _on_result(result): eos_token_id=gen_config.eos_token_id, ) result = await future - if result is None: - raise RuntimeError(f"CB manager stopped before producing a result for {request_id}") + # CB signals a failed request by setting ``error`` (and ``status = FAILED``) on the + # delivered GenerationOutput, often with empty ``generated_tokens``. Surface it instead + # of returning an empty success that downstream parsing/decoding would silently mask. 
+ # If the worker itself died, route to CBWorkerDeadError so the client gets the same 503 + # as requests submitted post-crash; otherwise it's a per-request failure (e.g. unsupported + # logit-processor kwarg) and a plain RuntimeError -> 500 is appropriate. + if result.error is not None: + if self._cb.fatal_error is not None: + raise CBWorkerDeadError(f"CB worker died during request {request_id}: {result.error}") + raise RuntimeError(f"CB generation failed for {request_id}: {result.error}") generated_ids = result.generated_tokens text = processor.decode(generated_ids, skip_special_tokens=True) return text, input_len, generated_ids @@ -805,6 +863,12 @@ def shutdown(self) -> None: self._cb_manager.stop() self._cb_manager = None + def is_cb_alive(self) -> bool: + """Whether the CB worker is healthy. ``True`` if CB is disabled or not yet initialized.""" + if self._cb_manager is None: + return True + return self._cb_manager.is_alive() + class BaseHandler: """Shared logic for chat completion and responses handlers. @@ -838,7 +902,10 @@ def _validate_request(self, body: dict) -> None: if self._valid_params_class is not None: unexpected = input_keys - getattr(self._valid_params_class, "__mutable_keys__", set()) if unexpected: - raise HTTPException(status_code=422, detail=f"Unexpected fields in the request: {unexpected}") + raise HTTPException( + status_code=422, + detail=f"Unexpected fields in the request: {unexpected}", + ) unused = input_keys & self._unused_fields if unused: logger.warning_once(f"Ignoring unsupported fields in the request: {unused}") @@ -872,7 +939,10 @@ def _resolve_model(self, body: dict) -> tuple[str, "PreTrainedModel", "Processor return model_id, model, processor def _build_generation_config( - self, body: dict, model_generation_config: "GenerationConfig", use_cb: bool = False + self, + body: dict, + model_generation_config: "GenerationConfig", + use_cb: bool = False, ) -> "GenerationConfig": """Build a GenerationConfig from shared params (temperature, top_p, seed, generation_config JSON). 
@@ -959,7 +1029,10 @@ def get_processor_inputs_from_messages(messages: list[dict], modality: Modality) if content_type in ("text", "input_text", "output_text"): parsed["content"].append({"type": "text", "text": content["text"]}) # Image: chat completions ("image_url") and Responses API ("input_image") - elif content_type in ("image_url", "input_image") and modality in (Modality.VLM, Modality.MULTIMODAL): + elif content_type in ("image_url", "input_image") and modality in ( + Modality.VLM, + Modality.MULTIMODAL, + ): # chat completions: {"image_url": {"url": "..."}}, Responses API: {"image_url": "..."} url = content["image_url"] if isinstance(url, dict): @@ -972,7 +1045,10 @@ def get_processor_inputs_from_messages(messages: list[dict], modality: Modality) audio_b64 = input_audio["data"] parsed["content"].append({"type": "audio", "url": f"data:audio/{fmt};base64,{audio_b64}"}) # Extensions (not part of the OpenAI API standard) - elif content_type == "video_url" and modality in (Modality.VLM, Modality.MULTIMODAL): + elif content_type == "video_url" and modality in ( + Modality.VLM, + Modality.MULTIMODAL, + ): parsed["content"].append({"type": "video", "url": content["video_url"]["url"]}) elif content_type == "audio_url" and modality == Modality.MULTIMODAL: parsed["content"].append({"type": "audio", "url": content["audio_url"]["url"]}) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 2dcdc5333f35..3c86dab10819 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -21,7 +21,7 @@ from collections.abc import Sequence from dataclasses import MISSING, dataclass, fields from functools import wraps -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, Union +from typing import Any, ClassVar, Literal, TypeVar from huggingface_hub import create_repo from huggingface_hub.dataclasses import strict @@ -43,10 +43,7 @@ logging, ) from .utils.generic import is_timm_config_dict - - -if TYPE_CHECKING: - import torch +from .utils.type_validators import dtype_validator logger = logging.get_logger(__name__) @@ -71,6 +68,7 @@ "dense", "hybrid", # for layers that have both mamba and attention in zamba and zamba2 "moe", # for nemotron_h, which uses either attention, mamba or moe + "mlp", # for nemotron_h standalone MLP layers (the "-" in hybrid_override_pattern) ) @@ -229,7 +227,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): # Common attributes for all models output_hidden_states: bool | None = False return_dict: bool | None = True - dtype: Union[str, "torch.dtype"] | None = None + dtype: Any = dtype_validator(default=None) chunk_size_feed_forward: int = 0 is_encoder_decoder: bool = False @@ -238,6 +236,19 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): label2id: dict[str, int] | dict[str, str] | None = None problem_type: Literal["regression", "single_label_classification", "multi_label_classification"] | None = None + @classmethod + def __get_pydantic_core_schema__(cls, source_type, handler): + """Allow PreTrainedConfig to be used as a field type in Pydantic models. + + Without this, Pydantic treats the dataclass as introspectable and tries to resolve + all field annotations — including forward references like `torch.dtype` that are + only available under TYPE_CHECKING. Returning an ``is-instance`` schema tells + Pydantic to accept any instance of this class without inspecting its fields. 
+ """ + from pydantic_core import core_schema + + return core_schema.is_instance_schema(cls) + def __post_init__(self, **kwargs): # BC for the `torch_dtype` argument instead of the simpler `dtype` # Do not warn, as it would otherwise always be triggered since most configs on the hub have `torch_dtype` @@ -1161,6 +1172,7 @@ def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None: "ignore_keys_at_rope_validation", "base_model_tp_plan", "base_model_pp_plan", + "distributed_config", ]: d.pop(key_to_remove, None) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index dadfeb4224ad..f72b8d566be3 100755 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -608,6 +608,10 @@ def _build_checkpoint_conversion_mapping(): WeightRenaming("mlp.shared_expert.", "mlp.shared_experts."), ] + mapping["gemma4"] = [ + WeightRenaming(r"\.linear\.weight", ".weight"), + ] + for model_type, base_pattern in _MODEL_TO_CONVERSION_PATTERN.items(): if model_type in mapping: continue diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index cd0710649c91..02685ace842c 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -1077,6 +1077,8 @@ def set_param_for_module( if ref is not None and param_value.shape != expected_shape and hf_quantizer is None: loading_info.mismatched_keys.add((target_name, param_value.shape, expected_shape)) else: + if distributed_operation is not None: + param_value = distributed_operation.post_shard_wrap(param_value) # super important otherwise _init_weight will re-init the param param_value._is_hf_initialized = True setattr(module_obj, param_name, param_value) @@ -1110,16 +1112,123 @@ class SkipParameters(Exception): pass +def _compute_all_prefixes(model) -> list[str]: + """ + Return all base-model prefix paths reachable from `model`, ordered shortest-first (BFS). + + `base_model_prefix` on a class means "when I am stored as a submodule in a parent + model, the parent stores me under the attribute named `base_model_prefix`". A child is + therefore a "base model" of the current model when its `base_model_prefix` matches the + attribute name it is stored under. + + Multiple base-model children are supported (e.g. a multi-modal model that contains + both `self.vision_model` and `self.text_model`). + + Examples: + + DetrForObjectDetection -> ["model"] + DetrForSegmentation -> ["detr", "detr.model"] + LlamaForCausalLM -> ["model"] + CLIPModel -> ["vision_model", "text_model"] + """ + prefixes: list[str] = [getattr(model, "base_model_prefix", "")] + queue: list[tuple] = [(model, getattr(model, "base_model_prefix", ""))] + + while queue: + current_model, accumulated_prefix = queue.pop(0) + for name, child in current_model.named_children(): + child_prefix = getattr(child, "base_model_prefix", "") + if child_prefix and child_prefix == name: + next_accumulated = f"{accumulated_prefix}.{name}" if accumulated_prefix else name + prefixes.append(next_accumulated) + queue.append((child, next_accumulated)) + + return prefixes + + +def _strip_model_prefix_for_save(key: str, model) -> str: + """ + Recursively strip all `base_model_prefix` segments from a state-dict key so that + reverse conversion rules (written relative to the innermost base model) operate on + bare keys regardless of nesting depth. 
+ + We identify each prefix level by finding the direct child whose `base_model_prefix` + matches its attribute name (same logic as `_compute_all_prefixes`). + + Examples for `DetrForSegmentation` (prefix chain `detr` -> `model`): + + "detr.model.backbone.x" -> "backbone.x" + "detr.class_labels_classifier.x" -> "class_labels_classifier.x" + "mask_head.x" -> "mask_head.x" + """ + for name, child in model.named_children(): + child_prefix = getattr(child, "base_model_prefix", "") + if child_prefix and child_prefix == name and key.startswith(name + "."): + stripped_key = key[len(name) + 1 :] + return _strip_model_prefix_for_save(stripped_key, child) + return key + + +def _resolve_key_for_prefix_nesting( + renamed_key: str, + valid_prefixes: list[str], + meta_state_dict: dict, +) -> str: + """ + Rewrite `renamed_key` with `valid_prefixes` from `_compute_all_prefixes` (longest prefixes first) so + `base_model_prefix` lines up for head and base models (strip wrapper prefixes or add missing inner ones). + + - Per prefix (longest first): strip leading `prefix.`; if `prefix` is dotted, also try prepending the substring + after its first `.`. + - If still unmatched: `valid_prefixes` only reflects the load target, so keys from a more wrapped checkpoint can + still embed `prefix.` in the middle of the path. For each prefix, restart from `renamed_key` and + repeatedly replace the string with everything after the first `prefix.` (discarding that segment and anything + before it), while the string starts with `prefix.` or contains `.{prefix}.`, until a suffix exists in + `meta_state_dict`. + + Args: + renamed_key: Key after weight renamings and conversion patterns. + valid_prefixes: Candidate `base_model_prefix` paths for the model being loaded. + meta_state_dict: Reference key set (e.g. `model.state_dict()`). + + Returns: + A matching key in `meta_state_dict`, or `renamed_key`. + """ + for prefix in reversed(valid_prefixes): + if renamed_key.startswith(prefix + "."): + candidate = renamed_key[len(prefix) + 1 :] + if candidate in meta_state_dict: + return candidate + if "." in prefix: + # remove the first prefix (current model's prefix) when adding it to the key + add_prefix = prefix.split(".", maxsplit=1)[1] + candidate = f"{add_prefix}.{renamed_key}" + if candidate in meta_state_dict: + return candidate + # Checkpoint may wrap the target at 2+ nesting levels (outer prefixes not in valid_prefixes), + # so we need to check for the prefix inside the key. + for prefix in reversed(valid_prefixes): + candidate = renamed_key + # avoid matching parts of module names containing the prefix + while f".{prefix}." in candidate or candidate.startswith(f"{prefix}."): + candidate = candidate.split(prefix + ".", maxsplit=1)[1] + if candidate in meta_state_dict: + return candidate + + return renamed_key + + def rename_source_key( source_key: str, weight_renamings: list[WeightRenaming], weight_converters: list[WeightConverter], - prefix: str | None = None, + valid_prefixes: list[str] | None = None, meta_state_dict: dict | None = None, ) -> tuple[str, str | None]: """ - Rename a source key given all the renaming and weight conversion patterns we have. Also takes care of adding/removing - the base model prefix during loading if necessary. + Apply all renaming and conversion patterns to `source_key`, then reconcile the + result against the model state dict (step 3) by trying to add or strip each prefix + level from `valid_prefixes` until the key is found. """ renamed_key = source_key # 1. 
apply all renamings in turns (if multiple match, it's the responsibility of the mappings to make sure they @@ -1135,15 +1244,10 @@ def rename_source_key( if source_pattern is not None: break - # 3. check if we need to add or remove prefix if necessary (only during loading, not saving) - if prefix is not None and meta_state_dict is not None: - if ( - renamed_key.startswith(prefix) - and meta_state_dict.get(re.sub(f"^{prefix}.", "", renamed_key, count=1)) is not None - ): - renamed_key = re.sub(f"^{prefix}.", "", renamed_key, count=1) - elif meta_state_dict.get(f"{prefix}.{renamed_key}") is not None: - renamed_key = f"{prefix}.{renamed_key}" + # 3. If the key is still not in the model state dict, try adding or removing each + # prefix level (longest first) until a match is found. Only active during loading. + if valid_prefixes is not None and meta_state_dict is not None and renamed_key not in meta_state_dict: + renamed_key = _resolve_key_for_prefix_nesting(renamed_key, valid_prefixes, meta_state_dict) return renamed_key, source_pattern @@ -1241,7 +1345,9 @@ def convert_and_load_state_dict_in_model( ``` """ - prefix = model.base_model_prefix + # All valid base_model_prefix paths for this model (e.g. ["rf_detr", "rf_detr.model"] + # for RfDetrForInstanceSegmentation); passed to rename_source_key to resolve keys. + valid_prefixes = _compute_all_prefixes(model) tp_plan = tp_plan or {} device_map = load_config.device_map or {"": "cpu"} hf_quantizer = load_config.hf_quantizer @@ -1294,11 +1400,13 @@ def convert_and_load_state_dict_in_model( for original_key, tensor in state_dict: # 1. Rename the key according to all renaming pattern and optional weight converter patterns renamed_key, source_pattern = rename_source_key( - original_key, renamings, converters, prefix, meta_model_state_dict + original_key, renamings, converters, valid_prefixes, meta_model_state_dict ) if renamed_key not in meta_model_state_dict and original_key in meta_model_state_dict: - # Key should probably not have been renamed but we might need the `prefix` to be added.` - renamed_key, source_pattern = rename_source_key(original_key, [], [], prefix, meta_model_state_dict) + # Key should probably not have been renamed but we might need the prefix(es) to be added. + renamed_key, source_pattern = rename_source_key( + original_key, [], [], valid_prefixes, meta_model_state_dict + ) # 2. finally, collect the tensor into the proper converter if renamed_key in meta_model_state_dict: @@ -1465,17 +1573,21 @@ def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch pattern_to_converter = {k: converter for converter in converters for k in converter.source_patterns} conversion_mapping = {} + # Opt in via `_checkpoint_conversion_prefix_free = True` when the source checkpoint is fully flat, + # so that all prefixes should be stripped before saving. 
+ strip_prefix = getattr(model, "_checkpoint_conversion_prefix_free", False) + state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0])) for original_key, tensor in state_dict: - # Rename the key according to all renaming pattern and optional weight converter patterns - renamed_key, source_pattern = rename_source_key(original_key, renamings, converters) + bare_key = _strip_model_prefix_for_save(original_key, model) if strip_prefix else original_key + renamed_key, source_pattern = rename_source_key(bare_key, renamings, converters) if source_pattern is not None: new_converter = deepcopy(pattern_to_converter[source_pattern]) # each target key gets its own converter instance mapping = conversion_mapping.setdefault(renamed_key, new_converter) else: - mapping = conversion_mapping.setdefault(renamed_key, WeightRenaming(original_key, renamed_key)) - source_pattern = original_key + mapping = conversion_mapping.setdefault(renamed_key, WeightRenaming(bare_key, renamed_key)) + source_pattern = bare_key mapping.add_tensor(renamed_key, original_key, source_pattern, tensor) diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 9c9e7b929f6f..b3d55aa1b70a 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -311,6 +311,42 @@ def get_class_in_module( return getattr(module, class_name) +def _compute_local_source_files_hash( + pretrained_model_name_or_path: str | os.PathLike, + module_file: str | os.PathLike, + resolved_module_file: str | os.PathLike, + modules_needed: list[str], +) -> str: + """ + Computes a stable hash from the bytes of the local source file and its direct relative-import source files. + """ + model_path = Path(pretrained_model_name_or_path).resolve() + module_parent = Path(module_file).parent + + resolved_module_file = Path(resolved_module_file).resolve() + + def _resolve_relative_source_path(source_file_path: Path) -> str: + try: + return source_file_path.relative_to(model_path).as_posix() + except ValueError: + # Fallback for edge cases where the source file is not under the local model directory. + return source_file_path.as_posix() + + files_to_hash = [ + (_resolve_relative_source_path(resolved_module_file), resolved_module_file), + ] + for module_needed in modules_needed: + module_needed_path = (model_path / module_parent / f"{module_needed}.py").resolve() + files_to_hash.append((_resolve_relative_source_path(module_needed_path), module_needed_path)) + + source_files_hash = hashlib.sha256() + for relative_path, file_path in sorted(files_to_hash, key=lambda entry: entry[0]): + source_files_hash.update(relative_path.encode("utf-8")) + source_files_hash.update(file_path.read_bytes()) + + return source_files_hash.hexdigest()[:16] + + def get_cached_module_file( pretrained_model_name_or_path: str | os.PathLike, module_file: str, @@ -376,9 +412,8 @@ def get_cached_module_file( # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. 
pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) - if is_local: - submodule = _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)) - else: + cached_module = None + if not is_local: submodule = os.path.sep.join(map(_sanitize_module_name, pretrained_model_name_or_path.split("/"))) cached_module = try_to_load_from_cache( pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type @@ -408,12 +443,17 @@ def get_cached_module_file( # Check we have all the requirements in our environment modules_needed = check_imports(resolved_module_file) + if is_local: + local_source_files_hash = _compute_local_source_files_hash( + pretrained_model_name_or_path, module_file, resolved_module_file, modules_needed + ) + submodule = _sanitize_module_name(local_source_files_hash) # Now we move the module inside our cached dynamic modules. full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule create_dynamic_module(full_submodule) submodule_path = Path(HF_MODULES_CACHE) / full_submodule - if submodule == _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)): + if is_local: # We copy local files to avoid putting too many folders in sys.path. This copy is done when the file is new or # has changed since last copy. if not (submodule_path / module_file).exists() or not filecmp.cmp( diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 9c47e551cee8..598076552001 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1005,7 +1005,14 @@ def __init__( @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isneginf(scores).all(dim=-1).any(): + raise ValueError( + "EtaLogitsWarper received a row with all logits set to -inf. " + "This usually means previous logits processors masked every token." 
+ ) + probabilities = scores.softmax(dim=-1) + entropy = torch.distributions.Categorical(logits=scores).entropy() eta = torch.min(self.epsilon, torch.sqrt(self.epsilon) * torch.exp(-entropy))[..., None] indices_to_remove = probabilities < eta diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 388cef73566a..dea42d9ffcc2 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1086,9 +1086,31 @@ def _get_logits_processor( UserWarning, ) if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0: - processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) + if self.config.is_encoder_decoder: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) + else: + inputs_embeds = model_kwargs.get("inputs_embeds") if model_kwargs is not None else None + if inputs_embeds is not None and (input_ids_seq_length is None or input_ids_seq_length == 0): + warnings.warn( + "Passing `repetition_penalty` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, + ) + else: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0: - processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) + if self.config.is_encoder_decoder: + processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) + else: + inputs_embeds = model_kwargs.get("inputs_embeds") if model_kwargs is not None else None + if inputs_embeds is not None and (input_ids_seq_length is None or input_ids_seq_length == 0): + warnings.warn( + "Passing `no_repeat_ngram_size` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, + ) + else: + processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) if ( generation_config.encoder_no_repeat_ngram_size is not None and generation_config.encoder_no_repeat_ngram_size > 0 @@ -2969,9 +2991,16 @@ def _get_top_k_continuations( # Gather the top K scores from _all_ beams. if do_sample: - topk_indices = torch.multinomial( - nn.functional.softmax(accumulated_log_probs, dim=-1), num_samples=beams_to_keep - ) + probs = nn.functional.softmax(accumulated_log_probs, dim=-1) + # torch.multinomial on CUDA requires the last dimension to be <= 2**24. + # When num_beams * vocab_size exceeds this, pre-filter to the top candidates. 
+ _MULTINOMIAL_MAX = 2**24 + if probs.shape[-1] > _MULTINOMIAL_MAX: + top_values, top_indices = torch.topk(probs, k=_MULTINOMIAL_MAX, dim=-1) + sampled = torch.multinomial(top_values, num_samples=beams_to_keep) + topk_indices = torch.gather(top_indices, dim=1, index=sampled) + else: + topk_indices = torch.multinomial(probs, num_samples=beams_to_keep) topk_log_probs = torch.gather(input=accumulated_log_probs, dim=1, index=topk_indices) else: topk_log_probs, topk_indices = torch.topk(accumulated_log_probs, k=beams_to_keep) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 675a0ea5783a..40672ae785e0 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -889,7 +889,13 @@ def __init__(self, model, max_static_cache_length, batch_size): self.register_buffer(f"value_cache_{i}", layer.values, persistent=False) self.register_buffer(f"cumulative_length_{i}", layer.cumulative_length, persistent=False) - def forward(self, decoder_input_ids, encoder_hidden_states, cache_position): + def forward( + self, + decoder_input_ids: torch.Tensor, + encoder_hidden_states: torch.Tensor, + cache_position: torch.Tensor, + encoder_attention_mask: torch.Tensor | None = None, + ): # Start by resetting static cache (it's needed to be able to run several generations with the same exported program, # as otherwise it's mutated in-place indefinitely - we cannot call reset in-between the `generate` as the program was # already exported) @@ -900,6 +906,7 @@ def forward(self, decoder_input_ids, encoder_hidden_states, cache_position): outputs = self.decoder( input_ids=decoder_input_ids, encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, past_key_values=self.cache, use_cache=True, ) @@ -947,7 +954,7 @@ def _export_encoder(self, encoder_input_ids): return exported_encoder - def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_position): + def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_position, encoder_attention_mask=None): target_device = self.full_model.device wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( @@ -963,27 +970,35 @@ def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_positi decoder_input_ids = decoder_input_ids.to(target_device) encoder_hidden_states = encoder_hidden_states.to(target_device) cache_position = cache_position.to(target_device) - - # Define dynamic dimension for encoder output sequence length - encoder_seq_len_dim = torch.export.Dim("encoder_hidden_seq_length", max=self.max_hidden_seq_length) - - # Export the decoder + if encoder_attention_mask is not None: + encoder_attention_mask = encoder_attention_mask.to(target_device) + + # Export the decoder. + # encoder_hidden_states uses a static shape to avoid a symbolic-shape + # conflict with the static KV cache size during torch.export. Callers + # that pad encoder inputs to a fixed max length (e.g. max_hidden_seq_length) + # should pass encoder_hidden_states of that shape. 
with torch.no_grad(): exported_decoder = torch.export.export( wrapped_decoder, - (decoder_input_ids, encoder_hidden_states, cache_position), - dynamic_shapes={ - "decoder_input_ids": None, - "encoder_hidden_states": {1: encoder_seq_len_dim}, - "cache_position": None, - }, + (decoder_input_ids, encoder_hidden_states, cache_position, encoder_attention_mask), + dynamic_shapes=None, strict=True, ) return exported_decoder - def export(self, encoder_input_ids=None, decoder_input_ids=None, encoder_hidden_states=None, cache_position=None): + def export( + self, + encoder_input_ids=None, + decoder_input_ids=None, + encoder_hidden_states=None, + cache_position=None, + encoder_attention_mask=None, + ): device = self.full_model.device + max_cache_len = self.generation_config.cache_config.get("max_cache_len") + batch_size = self.generation_config.cache_config.get("batch_size") example_encoder_input_ids = ( encoder_input_ids if encoder_input_ids is not None @@ -1001,14 +1016,22 @@ def export(self, encoder_input_ids=None, decoder_input_ids=None, encoder_hidden_ encoder_hidden_states if encoder_hidden_states is not None else torch.zeros( - (self.generation_config.cache_config.get("batch_size"), 10, self.config.d_model), + (batch_size, max_cache_len, self.config.d_model), dtype=torch.float32, device=device, ) ) + example_encoder_attention_mask = ( + encoder_attention_mask + if encoder_attention_mask is not None + else torch.ones((batch_size, max_cache_len), dtype=torch.long, device=device) + ) self.exported_encoder = self._export_encoder(example_encoder_input_ids) self.exported_decoder = self._export_decoder( - example_decoder_input_ids, example_encoder_hidden_states, example_cache_position + example_decoder_input_ids, + example_encoder_hidden_states, + example_cache_position, + example_encoder_attention_mask, ) # Return self to allow chaining @@ -1025,6 +1048,22 @@ def generate(self, prompt_token_ids, max_new_tokens): # Run encoder encoder_output = self.exported_encoder.module()(prompt_token_ids) + # Build encoder attention mask: 1 at real token positions, 0 at padding. + # Assumes padding token id is 0 (standard for T5 and most seq2seq models). 
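A side sketch of the fixed-shape contract this relies on (standalone toy, made-up sizes; `max_len` stands in for the `max_cache_len` pulled from the cache config below, and pad id 0 is assumed as stated in the comment above):

    import torch
    import torch.nn.functional as F

    max_len = 8                                            # static length baked into the export
    prompt_token_ids = torch.tensor([[37, 423, 12, 1]])    # (batch, seq) with seq <= max_len

    encoder_attention_mask = (prompt_token_ids != 0).long()
    pad = max_len - prompt_token_ids.shape[1]
    prompt_padded = F.pad(prompt_token_ids, (0, pad), value=0)
    mask_padded = F.pad(encoder_attention_mask, (0, pad), value=0)
    print(prompt_padded.shape, mask_padded.shape)          # torch.Size([1, 8]) torch.Size([1, 8])
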
+ max_cache_len = self.generation_config.cache_config.get("max_cache_len") + batch_size = prompt_token_ids.shape[0] + encoder_attention_mask = (prompt_token_ids != 0).long() + # Pad or trim to max_cache_len so shape matches the static export + if encoder_attention_mask.shape[1] < max_cache_len: + pad = torch.zeros( + (batch_size, max_cache_len - encoder_attention_mask.shape[1]), + dtype=torch.long, + device=model_device, + ) + encoder_attention_mask = torch.cat([encoder_attention_mask, pad], dim=1) + else: + encoder_attention_mask = encoder_attention_mask[:, :max_cache_len] + # Initialize with start token (0 for T5) on the correct device decoder_input_ids = torch.tensor([[0]], dtype=torch.long, device=model_device) generated_ids = [0] @@ -1033,7 +1072,10 @@ def generate(self, prompt_token_ids, max_new_tokens): for i in range(max_new_tokens - 1): # Run decoder for next token prediction logits = self.exported_decoder.module()( - decoder_input_ids, encoder_output, torch.tensor([i], dtype=torch.long, device=model_device) + decoder_input_ids, + encoder_output, + torch.tensor([i], dtype=torch.long, device=model_device), + encoder_attention_mask, ) # Get next token diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py index 083ec53a2fd3..f83007410f7d 100755 --- a/src/transformers/integrations/hqq.py +++ b/src/transformers/integrations/hqq.py @@ -127,3 +127,135 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve logger.warning("No linear modules were found in your model for quantization.") return model + + +class HqqQuantize: + """HQQ quantization operation for the new weight loading flow.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict, + full_layer_name=None, + model=None, + **kwargs, + ): + from hqq.core.quantize import HQQLinear + + from ..quantizers.quantizers_utils import get_module_from_name + + # input_dict has {param_name: [tensor]} for the weight + value = list(input_dict.values())[0] + value = value[0] if isinstance(value, list) else value + + # full_layer_name is e.g. 
"model.layers.0.self_attn.q_proj.weight" + module_name = full_layer_name.rsplit(".", 1)[0] + module, _ = get_module_from_name(model, full_layer_name) + + # Load weight into the nn.Linear module + module.weight = torch.nn.Parameter(value, requires_grad=False) + + # Get the quant_config that was set in _process_model_before_weight_loading + quant_config = getattr(module, "quant_config", None) + if quant_config is None: + # Module is skipped from quantization, just return the weight as-is + return {full_layer_name: value} + + # Determine target device and compute dtype + target_device = value.device + compute_dtype = self.hf_quantizer.dtype + + # Create HQQLinear from the nn.Linear + hqq_layer = HQQLinear( + module, + quant_config=quant_config, + compute_dtype=compute_dtype, + device=target_device, + del_orig=True, + ) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.hf_quantizer.using_multi_gpu: + hqq_layer = self.hf_quantizer._patch_layer_for_multigpu(hqq_layer) + + # Replace the module in the model + parent_module_name, _, child_name = module_name.rpartition(".") + parent_module = model.get_submodule(parent_module_name) if parent_module_name else model + setattr(parent_module, child_name, hqq_layer) + + # Mark as loaded so it's not reported as missing + missing_keys = kwargs.get("missing_keys") + if missing_keys is not None: + missing_keys.discard(full_layer_name) + + # Return empty dict so the loading code doesn't try to set params + return {} + + +class HqqDeserialize: + """Deserialize HQQ pre-quantized weights into an HQQLinear module.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict, + full_layer_name=None, + model=None, + **kwargs, + ): + from hqq.core.quantize import HQQLinear + + # Unwrap list values + state_dict = {} + for key, value in input_dict.items(): + state_dict[key] = value[0] if isinstance(value, list) else value + + # If W_q is not present, this is not an HQQ-quantized layer — pass through + if "W_q" not in state_dict: + return input_dict + + # full_layer_name is e.g. 
"model.layers.0.self_attn.v_proj.weight" + # (target pattern "weight" appended to module path) + module_name = full_layer_name.rsplit(".", 1)[0] + + parent_name, _, child_name = module_name.rpartition(".") + parent = model.get_submodule(parent_name) if parent_name else model + + # Create empty HQQLinear + hqq_layer = HQQLinear( + None, + None, + compute_dtype=self.hf_quantizer.dtype or torch.float16, + device="cpu", + initialize=False, + ) + + # Make W_q an nn.Parameter as HQQ expects + if "W_q" in state_dict: + state_dict["W_q"] = torch.nn.Parameter(state_dict["W_q"], requires_grad=False) + + hqq_layer.load_state_dict(state_dict) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.hf_quantizer.using_multi_gpu: + hqq_layer = self.hf_quantizer._patch_layer_for_multigpu(hqq_layer) + + setattr(parent, child_name, hqq_layer) + + # Mark weight and bias as loaded + missing_keys = kwargs.get("missing_keys") + if missing_keys is not None: + missing_keys.discard(full_layer_name) + # Also discard bias since HQQLinear handles it internally + bias_key = module_name + ".bias" + missing_keys.discard(bias_key) + + return {} diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 70a343424aa8..a88f385fe9a6 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -458,6 +458,15 @@ def decorator(cls): def new_init(self, *args, **kwargs): orig_init(self, *args, **kwargs) + # Skip attaching the kernelized submodule under DeepSpeed ZeRO-3: the coordinator traces + # the module graph at init time, and a child `nn.Module` that is not actually invoked + # during forward (e.g. when the model keeps calling the plain Python `apply_rotary_pos_emb`) + # breaks the parameter fetch trace and raises `IndexError: pop from an empty deque`. + # See https://github.com/huggingface/transformers/issues/45137 + from .deepspeed import is_deepspeed_zero3_enabled + + if is_deepspeed_zero3_enabled(): + return # Register new function as non-submodule within the modules dict hidden_kernels = self.__dict__.setdefault("_hidden_kernels", {}) diff --git a/src/transformers/integrations/moe.py b/src/transformers/integrations/moe.py index c8a8e87f3621..b30dd68bc0d4 100644 --- a/src/transformers/integrations/moe.py +++ b/src/transformers/integrations/moe.py @@ -15,6 +15,8 @@ from collections.abc import Callable from functools import wraps +from torch.distributed.tensor import DTensor + from ..utils import logging from ..utils.generic import GeneralInterface from ..utils.import_utils import ( @@ -405,17 +407,20 @@ def grouped_mm_experts_forward( tokens_per_expert = torch.histc(histc_input, bins=self.num_experts, min=0, max=self.num_experts - 1) offsets = torch.cumsum(tokens_per_expert, dim=0, dtype=torch.int32) + def _local(p): + return p.to_local() if isinstance(p, DTensor) else p + # Select expert weights and biases # NOTE: We keep all experts here and rely on offsets to target the active ones. # I have already implemented a version that only passes the active experts, but # to do so I had to use torch.unique which breaks the graph capture (data-dependent). # Also there were no speedup gains from it in my experiments, even in eager mode. 
if self.has_gate: - selected_weights = self.gate_up_proj - selected_biases = self.gate_up_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.gate_up_proj) + selected_biases = _local(self.gate_up_proj_bias)[expert_ids_g] if self.has_bias else None else: - selected_weights = self.up_proj - selected_biases = self.up_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.up_proj) + selected_biases = _local(self.up_proj_bias)[expert_ids_g] if self.has_bias else None # --- Up projection per expert (grouped) --- proj_out = _grouped_linear( @@ -431,8 +436,8 @@ def grouped_mm_experts_forward( proj_out = self.act_fn(proj_out) # (S, intermediate_dim) # Select down projection weights and biases - selected_weights = self.down_proj - selected_biases = self.down_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.down_proj) + selected_biases = _local(self.down_proj_bias)[expert_ids_g] if self.has_bias else None # --- Down projection per expert (grouped) --- proj_out = _grouped_linear( diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 7b93e0a134b8..cad07bc2d3fc 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -34,6 +34,7 @@ Transpose, WeightConverter, WeightRenaming, + rename_source_key, ) from ..utils import ( CONFIG_NAME, @@ -47,7 +48,7 @@ logging, ) from ..utils.hub import DownloadKwargs -from ..utils.loading_report import log_state_dict_report +from ..utils.loading_report import LoadStateDictInfo, log_state_dict_report if is_torch_available(): @@ -506,6 +507,7 @@ def load_adapter( `find_adapter_config_file` method. """ from peft import PeftType + from peft.tuners.tuners_utils import BaseTunerLayer from peft.utils.save_and_load import _maybe_shard_state_dict_for_tp from ..modeling_utils import LoadStateDictConfig, _get_resolved_checkpoint_files, load_state_dict @@ -618,45 +620,92 @@ def load_adapter( device_map = getattr(self, "hf_device_map", {"": self.device}) - # If the model is tensor parallel, we handle the sharding of the state dict here since the logic in `self._load_pretrained_model` - # is not compatible with the way PEFT adapter should be sharded. - has_tp_adapters = False - for module in self.modules(): - tp_info = getattr(module, "_tp_info", None) - if tp_info is not None: - has_tp_adapters = True - break - - if has_tp_adapters: + def _resolve_adapter_state_dict(): + # Materialize the adapter state dict from `adapter_state_dict` or `checkpoint_files`. Used by paths + # that bypass `self._load_pretrained_model` (which would otherwise read the files itself). 
all_pointer = set() if adapter_state_dict is not None: - merged_state_dict = adapter_state_dict - elif ( - checkpoint_files is not None - and checkpoint_files[0].endswith(".safetensors") - and adapter_state_dict is None - ): + return adapter_state_dict + if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"): merged_state_dict = {} for file in checkpoint_files: file_pointer = safe_open(file, framework="pt", device="cpu") all_pointer.add(file_pointer) for k in file_pointer.keys(): merged_state_dict[k] = file_pointer.get_tensor(k) + return merged_state_dict # Checkpoints are .bin - elif checkpoint_files is not None: + if checkpoint_files is not None: merged_state_dict = {} for ckpt_file in checkpoint_files: merged_state_dict.update(load_state_dict(ckpt_file)) - else: - raise ValueError("Neither a state dict nor checkpoint files were found.") + return merged_state_dict + raise ValueError("Neither a state dict nor checkpoint files were found.") - adapter_state_dict = merged_state_dict + def set_inference_mode(model): + model.eval() + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + module.requires_grad_(False) + + # If the model is tensor parallel, we handle the sharding of the state dict here since the logic in `self._load_pretrained_model` + # is not compatible with the way PEFT adapter should be sharded. + has_tp_adapters = False + for module in self.modules(): + tp_info = getattr(module, "_tp_info", None) + if tp_info is not None: + has_tp_adapters = True + break + + if has_tp_adapters: + adapter_state_dict = _resolve_adapter_state_dict() if any(not isinstance(v, torch.Tensor) for v in adapter_state_dict.values()): raise ValueError("Expected all values in the adapter state dict to be tensors.") _maybe_shard_state_dict_for_tp(self, adapter_state_dict, adapter_name) + if hotswap: + # Bypass the standard loader and use PEFT's hotswap path so that LoRA weights + # whose rank differs from the existing adapter's are copied (and zero-padded) + # in place rather than triggering a "size mismatch" reinit, and so the LoRA + # scaling is updated alongside the weights. 
+ from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict + + adapter_state_dict = _resolve_adapter_state_dict() + + # need to apply conversions manually as we don't use _load_pretrained_model + renamings = [r for r in peft_weight_conversions if isinstance(r, WeightRenaming)] + converters = [c for c in peft_weight_conversions if isinstance(c, WeightConverter)] + meta_state_dict = self.state_dict() + processed_state_dict = {} + for key, value in adapter_state_dict.items(): + renamed_key, _ = rename_source_key(key, renamings, converters, self.base_model_prefix, meta_state_dict) + processed_state_dict[renamed_key] = value + + check_hotswap_configs_compatible(self.peft_config[adapter_name], peft_config) + try: + hotswap_adapter_from_state_dict( + model=self, + state_dict=processed_state_dict, + adapter_name=adapter_name, + config=peft_config, + ) + except Exception as e: + logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error:\n{e}") + raise + + if peft_config.inference_mode: + set_inference_mode(self) + + return LoadStateDictInfo( + missing_keys=set(), + unexpected_keys=set(), + mismatched_keys=set(), + error_msgs=[], + conversion_errors={}, + ) + load_config = replace( load_config, pretrained_model_name_or_path=peft_model_id, @@ -676,12 +725,7 @@ def load_adapter( ) if peft_config.inference_mode: - from peft.tuners.tuners_utils import BaseTunerLayer - - self.eval() - for module in self.modules(): - if isinstance(module, BaseTunerLayer): - module.requires_grad_(False) + set_inference_mode(self) adapter_key_markers = {adapter_name} if peft_config is not None and getattr(peft_config, "peft_type", None) is not None: @@ -699,6 +743,16 @@ def is_adapter_key(key: str) -> bool: loading_info=loading_info, logger=logger, ) + + if self._prepare_peft_hotswap_kwargs is not None: + # Apply once, after the first adapter has been loaded but before the model is + # compiled, so the LoRA layers get padded up to target_rank and a later adapter + # with a different rank can be hot-swapped in without recompiling. 
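A standalone toy (plain tensors, not PEFT code) of why padding up to `target_rank` makes later swaps compile-safe: the adapter buffer is allocated once at the target rank, and adapters of smaller rank are copied in with zero padding, so shapes and tensor identities never change.

    import torch

    target_rank, in_features = 16, 32
    lora_A = torch.zeros(target_rank, in_features)   # allocated once at the padded rank

    rank_8_adapter = torch.randn(8, in_features)
    lora_A.zero_()
    lora_A[:8].copy_(rank_8_adapter)                 # smaller adapter: copy + zero padding

    rank_16_adapter = torch.randn(16, in_features)
    lora_A.copy_(rank_16_adapter)                    # full-rank adapter: fills the buffer

    # Shape and object identity are unchanged, so a graph compiled against
    # `lora_A` does not have to be re-traced when the adapter is swapped.
    print(lora_A.shape)                              # torch.Size([16, 32])
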
+ from peft.utils.hotswap import prepare_model_for_compiled_hotswap + + prepare_model_for_compiled_hotswap(self, config=peft_config, **self._prepare_peft_hotswap_kwargs) + self._prepare_peft_hotswap_kwargs = None + return loading_info def enable_peft_hotswap( diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py index bdf82e8490f0..02f677203856 100644 --- a/src/transformers/integrations/tensor_parallel.py +++ b/src/transformers/integrations/tensor_parallel.py @@ -29,6 +29,7 @@ import torch import torch.distributed as dist from torch import nn + from torch.distributed.tensor import DTensor, Shard # Cache this result has it's a C FFI call which can be pretty time-consuming _torch_distributed_available = torch.distributed.is_available() @@ -130,6 +131,17 @@ def _get_parameter_tp_plan(parameter_name: str, tp_plan: dict[str, str], is_weig return None +def get_ep_sharded_param_names(model) -> list[str]: + """FQNs of parameters whose data is per-rank unique under EP sharding.""" + if not getattr(model, "has_ep", False): + return [] + return [ + name + for name, _ in model.named_parameters() + if _get_parameter_tp_plan(parameter_name=name, tp_plan=model.tp_plan, is_weight=True) == "grouped_gemm" + ] + + # ============================================================================= # Tensor Sharding Utilities # ============================================================================= @@ -685,6 +697,14 @@ def update_module_attributes(self, module: nn.Module): """ pass + def post_shard_wrap(self, param: nn.Parameter) -> nn.Parameter: + """ + Optional final wrap applied to a parameter after `shard_tensor` and before it is + attached to the module. Default is identity. Subclasses can override to e.g. wrap + the local shard as a DTensor. + """ + return param + class ColwiseParallel(TensorParallelLayer): """ @@ -1078,6 +1098,15 @@ def update_module_attributes(self, module: nn.Module): if hasattr(module, "num_experts"): module.num_experts = self.get_expected_sharded_shape((self.empty_param.shape[0],))[0] + def post_shard_wrap(self, param: nn.Parameter) -> nn.Parameter: + """ + Wrap the EP-sharded local tensor as a DTensor on the TP/EP mesh. Without this, the + optimizer's foreach ops error with "mixed Tensor and DTensor" against the + FSDP-wrapped DTensor params on the rest of the model. 
+ """ + dt = DTensor.from_local(param.data, self.device_mesh, [Shard(0)], run_check=False) + return nn.Parameter(dt, requires_grad=param.requires_grad) + class RouterParallel(TensorParallelLayer): """ @@ -1488,6 +1517,8 @@ def shard_and_distribute_module( # otherwise loading is crazy slow if not isinstance(param, torch.nn.Parameter): param = torch.nn.Parameter(param, requires_grad=empty_param.is_floating_point()) + if current_shard_plan is not None: + param = tp_layer.post_shard_wrap(param) setattr(module_to_tp, param_type, param) if tp_layer is not None: tp_layer.update_module_attributes(module_to_tp) diff --git a/src/transformers/loss/loss_utils.py b/src/transformers/loss/loss_utils.py index 51564d299e55..7b984caa84c0 100644 --- a/src/transformers/loss/loss_utils.py +++ b/src/transformers/loss/loss_utils.py @@ -31,10 +31,21 @@ def fixed_cross_entropy( target: torch.Tensor, num_items_in_batch: torch.Tensor | None = None, ignore_index: int = -100, - **kwargs, + weight: torch.Tensor | None = None, + label_smoothing: float = 0.0, + **_kwargs, ) -> torch.Tensor: reduction = "sum" if num_items_in_batch is not None else "mean" - loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction) + + loss = nn.functional.cross_entropy( + source, + target, + ignore_index=ignore_index, + reduction=reduction, + label_smoothing=label_smoothing, + weight=weight, + ) + if reduction == "sum": # just in case users pass an int for num_items_in_batch, which could be the case for custom trainer if torch.is_tensor(num_items_in_batch): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index b041964bbdfc..77813515fe39 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -929,8 +929,10 @@ def invert_attention_mask(self: "PreTrainedModel", encoder_attention_mask: Tenso """ if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] - if encoder_attention_mask.dim() == 2: + elif encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + else: + raise ValueError(f"Wrong shape for encoder_attention_mask (shape {encoder_attention_mask.shape})") # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) @@ -1293,18 +1295,21 @@ def __init_subclass__(cls, **kwargs): child_attribute = cls.__dict__.get("config_class", None) # defined in the class (this subclass or any parent class) + # `get_type_hints` resolves the down MRO until the first hit, so it will return `child_annotation` + # if the child has `cls.config` defined full_annotation = get_type_hints(cls).get("config", None) full_attribute = cls.config_class - # priority (child class_config -> child annotation -> global class_config -> global annotation) + # priority (child class_config -> child annotation -> child/global annotation -> global attribute) + # Important to keep this specific order for Python>=3.14 if child_attribute is not None: cls.config_class = child_attribute elif child_annotation is not None: cls.config_class = child_annotation - elif full_attribute is not None: - cls.config_class = full_attribute elif full_annotation is not None: cls.config_class = full_annotation + elif full_attribute is not None: + cls.config_class = full_attribute def __init__(self, config: PreTrainedConfig, 
*inputs, **kwargs): super().__init__() @@ -1395,12 +1400,18 @@ def post_init(self): self.init_weights() self._backward_compatibility_gradient_checkpointing() + @property + def has_ep(self) -> bool: + """Whether expert parallelism is enabled for this model.""" + distributed_config = getattr(getattr(self, "config", None), "distributed_config", None) + return distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) + @property def tp_plan(self) -> dict[str, str]: """ The full tp plan for the model's modules """ - if hasattr(self.config, "distributed_config") and self.config.distributed_config.enable_expert_parallel: + if self.has_ep: return self._ep_plan return self._tp_plan @@ -2735,6 +2746,7 @@ def resize_token_embeddings( def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True): old_embeddings = self.get_input_embeddings() + old_lm_head = copy.deepcopy(self.get_output_embeddings()) new_embeddings = self._get_resized_embeddings( old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing ) @@ -2757,8 +2769,7 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean new_num_tokens = new_embeddings.weight.shape[0] # if word embeddings are not tied, make sure that lm head is resized as well - if self.get_output_embeddings() is not None: - old_lm_head = self.get_output_embeddings() + if old_lm_head is not None: if isinstance(old_lm_head, torch.nn.Embedding): new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) else: @@ -2982,6 +2993,8 @@ def _get_resized_lm_head( old_num_tokens, old_lm_head_dim = ( old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() ) + old_num_tokens = getattr(old_lm_head, "out_features", old_num_tokens) + old_lm_head_dim = getattr(old_lm_head, "in_features", old_lm_head_dim) if old_num_tokens == new_num_tokens and not is_deepspeed_zero3_enabled(): old_lm_head.out_features = new_num_tokens # maybe weights are tied which doesn't update attr @@ -3358,6 +3371,34 @@ def save_pretrained( if self._auto_class is not None: custom_object_save(self, save_directory, config=self.config) + # If tie_word_embeddings=True but weights have diverged (e.g. after PEFT merge_and_unload), + # auto-fix the config before saving, mirroring the load-side check in tie_weights(). + if getattr(model_to_save.config, "tie_word_embeddings", False): + output_embeddings = model_to_save.get_output_embeddings() + if output_embeddings is not None: + out_w = getattr(output_embeddings, "weight", None) + in_w = getattr(model_to_save.get_input_embeddings(), "weight", None) + if out_w is not None and in_w is not None and out_w is not in_w: + tied_keys = getattr(model_to_save, "_tied_weights_keys", None) or {} + out_names = {n for n, p in model_to_save.named_parameters() if p is out_w} + in_names = {n for n, p in model_to_save.named_parameters() if p is in_w} + if any( + (k in out_names and v in in_names) or (k in in_names and v in out_names) + for k, v in tied_keys.items() + ) and ( + out_w.shape != in_w.shape + or ( + out_w.device == in_w.device + and out_w.device.type != "meta" + and not torch.equal(out_w, in_w) + ) + ): + model_to_save.config.tie_word_embeddings = False + logger.warning( + "Model config has `tie_word_embeddings=True` but input and output embedding " + "weights have diverged. Saving config with `tie_word_embeddings=False`." 
+ ) + # Save the config if is_main_process: if not _hf_peft_config_loaded: @@ -3671,14 +3712,27 @@ def float(self, *args): @classmethod def get_init_context( - cls, dtype: torch.dtype, is_quantized: bool, _is_ds_init_called: bool, allow_all_kernels: bool | None + cls, + dtype: torch.dtype, + is_quantized: bool, + _is_ds_init_called: bool, + allow_all_kernels: bool | None, + distributed_config=None, ): # Need to instantiate with correct dtype init_contexts = [local_torch_dtype(dtype, cls.__name__), init.no_tie_weights(), apply_patches()] # Needed as we cannot forward the `allow_all_kernels` arg in the model's __init__ if allow_all_kernels: init_contexts.append(allow_all_hub_kernels()) - if is_deepspeed_zero3_enabled(): + _has_ep = distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) + if _has_ep and is_deepspeed_zero3_enabled(): + # EP + DeepSpeed: use meta device (same as the normal non-DS path). + # zero.Init is skipped because EP needs to shard experts via distribute_model() + # hooks, which are incompatible with ZeRO-3 lazy parameters. + # The standard weight loading path (not zero3) handles EP sharding via + # shard_and_distribute_module. deepspeed.initialize() wraps the result later. + init_contexts.extend([torch.device("meta"), init.meta_device_safe_creation_ops()]) + elif is_deepspeed_zero3_enabled(): import deepspeed # We cannot initialize the model on meta device with deepspeed when not quantized @@ -4086,6 +4140,12 @@ def from_pretrained( download_kwargs_with_commit, **adapter_kwargs, ) + # EP + DeepSpeed: clear device_map (set by initialize_tensor_parallelism) so the model + # loads on CPU first. distribute_model() handles GPU placement during EP sharding. + # Without this, device_map triggers accelerate's dispatch path which breaks shard loading. + _has_ep = distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) + if _has_ep and is_deepspeed_zero3_enabled(): + device_map = None device_map = check_and_set_device_map(device_map) # warn, error and fix the device map user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} @@ -4194,7 +4254,9 @@ def from_pretrained( register_fusion_patches(cls, config, fusion_config) - model_init_context = cls.get_init_context(dtype, is_quantized, _is_ds_init_called, allow_all_kernels) + model_init_context = cls.get_init_context( + dtype, is_quantized, _is_ds_init_called, allow_all_kernels, distributed_config + ) config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. with ContextManagers(model_init_context): @@ -4327,7 +4389,11 @@ def _load_pretrained_model( error_msgs = [] - if is_deepspeed_zero3_enabled() and not is_quantized: + # EP + DeepSpeed: skip zero3 loading path. The model was created on meta device + # (not via zero.Init), so params are not zero3-partitioned. The standard loading + # path handles EP sharding via shard_and_distribute_module using the EP plan hooks + # registered by distribute_model(). deepspeed.initialize() wraps the result later. 
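For readers unfamiliar with the meta-device path referred to here, a standalone sketch of the pattern (plain torch, not the loading code): parameters are created without storage and only materialized once real weights are available.

    import torch
    import torch.nn as nn

    with torch.device("meta"):
        layer = nn.Linear(8, 8)                      # no memory allocated yet
    print(layer.weight.device)                       # meta

    layer = layer.to_empty(device="cpu")             # allocate uninitialized storage
    with torch.no_grad():
        layer.weight.copy_(torch.zeros(8, 8))        # stand-in for copying checkpoint weights
    print(layer.weight.device)                       # cpu
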
+ if is_deepspeed_zero3_enabled() and not is_quantized and not model.has_ep: if state_dict is None: merged_state_dict = {} for ckpt_file in checkpoint_files: @@ -4646,14 +4712,12 @@ def _move_missing_keys_from_meta_to_device( """ is_quantized = hf_quantizer is not None # This is the only case where we do not initialize the model on meta device, so we don't have to do anything here - if is_deepspeed_zero3_enabled() and not is_quantized: + # Exception: EP + DeepSpeed uses meta device (not zero.Init), so it needs the standard move path. + if is_deepspeed_zero3_enabled() and not is_quantized and not self.has_ep: return - # In this case we need to move everything back + # Leave parameters on meta on non-rank-0 FSDP ranks (rank-0 broadcast overwrites them); only buffers need real placeholders. if is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized: - for key, param in self.named_parameters(): - value = torch.zeros_like(param, device="cpu") - _load_parameter_into_model(self, key, value) for key, buffer in self.named_buffers(): value = torch.zeros_like(buffer, device="cpu") _load_parameter_into_model(self, key, value) @@ -4704,7 +4768,7 @@ def _initialize_missing_keys(self, is_quantized: bool) -> None: self._is_hf_initialized = True # This will only initialize submodules that are not marked as initialized by the line above. - if is_deepspeed_zero3_enabled() and not is_quantized: + if is_deepspeed_zero3_enabled() and not is_quantized and not self.has_ep: import deepspeed # keep_vars=True as we need the original tensors, so that the "_is_hf_initialized" is present on them diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 6162cb29559e..8f0b50f9b875 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -125,7 +125,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: @@ -137,7 +137,7 @@ def forward( embeddings = inputs_embeds + token_type_embeddings position_embeddings = self.position_embeddings(position_ids) - embeddings = embeddings + position_embeddings + embeddings = embeddings + position_embeddings.to(embeddings.device) embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d9ebfedb7ae9..4bbf0814c86e 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -36,6 +36,7 @@ CONFIG_MAPPING_NAMES.update( { "EvollaModel": "EvollaConfig", + "ernie4_5_moe_vl": "Ernie4_5_VLMoeConfig", "mlcd": "MLCDVisionConfig", "vibevoice_acoustic_tokenizer_decoder": "VibeVoiceAcousticTokenizerDecoderConfig", "vibevoice_acoustic_tokenizer_encoder": "VibeVoiceAcousticTokenizerEncoderConfig", @@ -49,6 +50,7 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME.update( { "EvollaModel": "evolla", + "ernie4_5_moe_vl": "ernie4_5_vl_moe", "vibevoice_acoustic_tokenizer_encoder": 
"vibevoice_acoustic_tokenizer", "vibevoice_acoustic_tokenizer_decoder": "vibevoice_acoustic_tokenizer", } diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 06998e9f02df..9e2d82c6f0f0 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1215,6 +1215,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): [ ("cohere_asr", "CohereAsrForConditionalGeneration"), ("dia", "DiaForConditionalGeneration"), + ("glmasr", "GlmAsrForConditionalGeneration"), ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("kyutai_speech_to_text", "KyutaiSpeechToTextForConditionalGeneration"), ("moonshine", "MoonshineForConditionalGeneration"), @@ -1686,6 +1687,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): # Model for Text-To-Waveform mapping ("bark", "BarkModel"), ("csm", "CsmForConditionalGeneration"), + ("dia", "DiaForConditionalGeneration"), ("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"), ("higgs_audio_v2", "HiggsAudioV2ForConditionalGeneration"), ("musicgen", "MusicgenForConditionalGeneration"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 6d0adc8473a6..49155ccadcc9 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -58,7 +58,6 @@ logger = logging.get_logger(__name__) # V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based) -REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {} REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {} TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None]( @@ -413,8 +412,10 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None: if class_name in REGISTERED_FAST_ALIASES: return REGISTERED_FAST_ALIASES[class_name] - if class_name in REGISTERED_TOKENIZER_CLASSES: - return REGISTERED_TOKENIZER_CLASSES[class_name] + # User-registered classes take priority over built-ins + for tokenizer in TOKENIZER_MAPPING._extra_content.values(): + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer if class_name == "TokenizersBackend": return TokenizersBackend @@ -441,10 +442,6 @@ def tokenizer_class_from_name(class_name: str) -> type[Any] | None: except AttributeError: continue - for tokenizer in TOKENIZER_MAPPING._extra_content.values(): - if getattr(tokenizer, "__name__", None) == class_name: - return tokenizer - # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # init and we return the proper dummy to get an appropriate error message. 
@@ -715,13 +712,24 @@ def from_pretrained( and (TOKENIZER_MAPPING_NAMES.get(config_model_type).removesuffix("Fast")) != (tokenizer_config_class.removesuffix("Fast")) ): - tokenizer_class = tokenizer_class_from_name(tokenizer_config_class) - if tokenizer_class is not None and tokenizer_class.__name__ not in ( - "TokenizersBackend", - "PythonBackend", - "PreTrainedTokenizerFast", - ): - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + mapped_tokenizer_class = TOKENIZER_MAPPING_NAMES.get(config_model_type) + # When `MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS` (or an explicit registration) + # pins a model_type to `TokenizersBackend`, the `tokenizer_class` declared in the + # Hub's `tokenizer_config.json` is known to be wrong (e.g. DeepSeek-V3/R1 which + # ship `tokenizer_class: LlamaTokenizerFast` over a ByteLevel `tokenizer.json`, + # but `LlamaTokenizerFast.__init__` would clobber the pre-tokenizer with + # Metaspace and silently break round-trip). Honor the override and skip the + # specialized class path entirely. + forced_tokenizers_backend = mapped_tokenizer_class == "TokenizersBackend" + + if not forced_tokenizers_backend: + tokenizer_class = tokenizer_class_from_name(tokenizer_config_class) + if tokenizer_class is not None and tokenizer_class.__name__ not in ( + "TokenizersBackend", + "PythonBackend", + "PreTrainedTokenizerFast", + ): + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) if TokenizersBackend is not None: return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) @@ -864,10 +872,6 @@ def register( else: raise ValueError("You need to pass a `tokenizer_class`") - for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class): - if candidate is not None: - REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate - if slow_tokenizer_class is not None and fast_tokenizer_class is not None: REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 53053f644539..a95c8e9752be 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -127,9 +127,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/beit/image_processing_pil_beit.py b/src/transformers/models/beit/image_processing_pil_beit.py index e3ccf12e909b..ff78dac96c40 100644 --- a/src/transformers/models/beit/image_processing_pil_beit.py +++ b/src/transformers/models/beit/image_processing_pil_beit.py @@ -120,10 +120,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - # Avoid using underflow conversion - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = 
(image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def _preprocess( diff --git a/src/transformers/models/chmv2/image_processing_chmv2.py b/src/transformers/models/chmv2/image_processing_chmv2.py index 3bb82b2dea53..067ba5898734 100644 --- a/src/transformers/models/chmv2/image_processing_chmv2.py +++ b/src/transformers/models/chmv2/image_processing_chmv2.py @@ -182,9 +182,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/chmv2/modular_chmv2.py b/src/transformers/models/chmv2/modular_chmv2.py index f61c6687a351..5f44654876c6 100644 --- a/src/transformers/models/chmv2/modular_chmv2.py +++ b/src/transformers/models/chmv2/modular_chmv2.py @@ -150,6 +150,17 @@ class CHMv2ImageProcessor(DPTImageProcessor): image_std = [0.213, 0.156, 0.143] valid_kwargs = CHMv2ImageProcessorKwargs + def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: + """Reduce label values by 1, replacing 0 with 255.""" + for idx in range(len(labels)): + label = labels[idx] + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 + labels[idx] = label + return labels + def post_process_depth_estimation( self, outputs: "DepthEstimatorOutput", diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 2bca67e59a21..47eaf36e303a 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -401,6 +401,10 @@ class CLIPPreTrainedModel(PreTrainedModel): "hidden_states": CLIPEncoderLayer, "attentions": CLIPAttention, } + _keys_to_ignore_on_load_unexpected = [ + r".*text_model\.embeddings\.position_ids", + r".*vision_model\.embeddings\.position_ids", + ] @torch.no_grad() def _init_weights(self, module): diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py index 1192be10606d..42f4bf3117da 100644 --- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py +++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py @@ -284,17 +284,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." 
) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech.to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 2f10c81b38e1..a2c203d763a1 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -1477,6 +1477,8 @@ def inverse_sigmoid(x, eps=1e-5): """ ) class ConditionalDetrForObjectDetection(ConditionalDetrPreTrainedModel): + base_model_prefix = "conditional_detr" + def __init__(self, config: ConditionalDetrConfig): super().__init__(config) diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 384cc388cfd7..292917a4f2a1 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1298,6 +1298,8 @@ def forward(self, x): """ ) class DetrForObjectDetection(DetrPreTrainedModel): + base_model_prefix = "detr" + def __init__(self, config: DetrConfig): super().__init__(config) diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 6d157f6385c0..7969cead3f21 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -192,9 +192,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/dpt/image_processing_pil_dpt.py b/src/transformers/models/dpt/image_processing_pil_dpt.py index 6f770cac4e5f..07e711769829 100644 --- a/src/transformers/models/dpt/image_processing_pil_dpt.py +++ b/src/transformers/models/dpt/image_processing_pil_dpt.py @@ -180,9 +180,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = (image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def resize( diff --git a/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py index e4eea836f107..4d16d9061fd3 100644 --- a/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py @@ -67,8 +67,8 @@ class Ernie4_5_VLMoeTextConfig(PreTrainedConfig): Whether to use a bias in any of the projections including mlp and attention for example moe_k (`int`, *optional*, defaults to 6): Number of selected experts. - moe_num_experts (`int`, *optional*, defaults to 64): - Number of routed experts. 
+ moe_num_experts (`int` or `list[int]`, *optional*, defaults to 64): + Number of routed experts. Can be a list to specify per-layer expert counts. moe_num_shared_experts (`int`, *optional*, defaults to 2): The number of experts that are shared for all MoE forwards. moe_norm_min (`float`, *optional*, defaults to 1e-12): @@ -119,7 +119,7 @@ class Ernie4_5_VLMoeTextConfig(PreTrainedConfig): use_bias: bool | None = False moe_intermediate_size: list[int] | None = None moe_k: int | None = 6 - moe_num_experts: int | None = 64 + moe_num_experts: int | list[int] | None = 64 moe_num_shared_experts: int | None = 2 moe_norm_min: float | None = 1e-12 output_router_logits: bool | None = False diff --git a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py index ad47bc0508a3..5769a1272ed1 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py @@ -117,8 +117,8 @@ class Ernie4_5_VLMoeTextConfig(Ernie4_5_MoeConfig): Whether to use a bias in any of the projections including mlp and attention for example moe_k (`int`, *optional*, defaults to 6): Number of selected experts. - moe_num_experts (`int`, *optional*, defaults to 64): - Number of routed experts. + moe_num_experts (`int` or `list[int]`, *optional*, defaults to 64): + Number of routed experts. Can be a list to specify per-layer expert counts. moe_num_shared_experts (`int`, *optional*, defaults to 2): The number of experts that are shared for all MoE forwards. moe_norm_min (`float`, *optional*, defaults to 1e-12): @@ -149,6 +149,7 @@ class Ernie4_5_VLMoeTextConfig(Ernie4_5_MoeConfig): pad_token_id: int | None = None eos_token_id: int | list[int] | None = None bos_token_id: int | None = None + moe_num_experts: int | list[int] | None = 64 moe_layer_end_index = AttributeError() moe_layer_interval = AttributeError() moe_layer_start_index = AttributeError() diff --git a/src/transformers/models/gemma4/convert_gemma4_weights.py b/src/transformers/models/gemma4/convert_gemma4_weights.py index 53940445c7e6..07129c44bb79 100644 --- a/src/transformers/models/gemma4/convert_gemma4_weights.py +++ b/src/transformers/models/gemma4/convert_gemma4_weights.py @@ -63,10 +63,278 @@ # ==== Internal Constants and Classes ==== + +def _patch_template_for_openai_tool_role(template: str) -> str: + """Patch a Gemma4 chat template to support OpenAI-standard ``role: "tool"`` messages. + + Applies three string replacements to the upstream template: + + 1. Injects a ``format_tool_response_block`` macro after the ``strip_thinking`` macro + to DRY up tool-response rendering. + 2. Injects a ``last_user_idx`` pre-scan and replaces the entire message loop to: + - Skip ``role: "tool"`` messages in the outer loop (they are rendered proactively). + - Forward-scan consecutive ``role: "tool"`` messages from assistant turns that + have ``tool_calls``, rendering them as ``<|tool_response>`` blocks. + - Resolve ``tool_call_id`` back to function names from the originating ``tool_calls``. + - Handle ``content`` as both plain strings and OpenAI content-parts arrays. + - Suppress duplicate ``<|turn>model`` when consecutive assistant messages are + separated only by tool messages (multi-round tool-call loops). + - Render ``reasoning`` / ``reasoning_content`` fields as ``<|channel>thought`` blocks. + 3. Preserves legacy ``tool_responses`` on assistant messages (Google/Gemma native format). 
+ """ + # --- Change 1: Inject format_tool_response_block macro after strip_thinking --- + old_after_strip = """{%- endmacro -%}\n\n{%- set ns = namespace(prev_message_type=None) -%}""" + + new_after_strip = ( + """{%- endmacro -%}\n""" + """\n""" + """{%- macro format_tool_response_block(tool_name, response) -%}\n""" + """ {{- '<|tool_response>' -}}\n""" + """ {%- if response is mapping -%}\n""" + """ {{- 'response:' + tool_name + '{' -}}\n""" + """ {%- for key, value in response | dictsort -%}\n""" + """ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n""" + """ {%- if not loop.last %},{% endif -%}\n""" + """ {%- endfor -%}\n""" + """ {{- '}' -}}\n""" + """ {%- else -%}\n""" + """ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}\n""" + """ {%- endif -%}\n""" + """ {{- '' -}}\n""" + """{%- endmacro -%}\n""" + """\n""" + """{%- set ns = namespace(prev_message_type=None) -%}""" + ) + template = template.replace(old_after_strip, new_after_strip) + + # --- Change 2: Replace entire message loop with OpenAI-compatible version --- + # The old message loop is identical between E4B and 31B templates. + old_message_loop = ( + """{#- Loop through messages -#}\n""" + """{%- for message in loop_messages -%}\n""" + """ {%- set ns.prev_message_type = None -%}\n""" + """ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n""" + """ {{- '<|turn>' + role + '\\n' }}\n""" + """\n""" + """ {%- if message['tool_calls'] -%}\n""" + """ {%- for tool_call in message['tool_calls'] -%}\n""" + """ {%- set function = tool_call['function'] -%}\n""" + """ {{- '<|tool_call>call:' + function['name'] + '{' -}}\n""" + """ {%- if function['arguments'] is mapping -%}\n""" + """ {%- set ns_args = namespace(found_first=false) -%}\n""" + """ {%- for key, value in function['arguments'] | dictsort -%}\n""" + """ {%- if ns_args.found_first %},{% endif -%}\n""" + """ {%- set ns_args.found_first = true -%}\n""" + """ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n""" + """ {%- endfor -%}\n""" + """ {%- elif function['arguments'] is string -%}\n""" + """ {{- function['arguments'] -}}\n""" + """ {%- endif -%}\n""" + """ {{- '}' -}}\n""" + """ {%- endfor -%}\n""" + """ {%- set ns.prev_message_type = 'tool_call' -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if message['tool_responses'] -%}\n""" + """ {#- Tool Response handling -#}\n""" + """ {%- for tool_response in message['tool_responses'] -%}\n""" + """ {{- '<|tool_response>' -}}\n""" + """ {%- if tool_response['response'] is mapping -%}\n""" + """ {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}\n""" + """ {%- for key, value in tool_response['response'] | dictsort -%}\n""" + """ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n""" + """ {%- if not loop.last %},{% endif -%}\n""" + """ {%- endfor -%}\n""" + """ {{- '}' -}}\n""" + """ {%- else -%}\n""" + """ {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}\n""" + """ {%- endif -%}\n""" + """ {{- '' -}}\n""" + """ {%- endfor -%}\n""" + """ {%- set ns.prev_message_type = 'tool_response' -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if message['content'] is string -%}\n""" + """ {%- if role == 'model' -%}\n""" + """ {{- strip_thinking(message['content']) -}}\n""" + """ {%- else -%}\n""" + """ {{- message['content'] | trim -}}\n""" + """ {%- endif -%}\n""" + """ {%- elif 
message['content'] is sequence -%}\n""" + """ {%- for item in message['content'] -%}\n""" + """ {%- if item['type'] == 'text' -%}\n""" + """ {%- if role == 'model' -%}\n""" + """ {{- strip_thinking(item['text']) -}}\n""" + """ {%- else -%}\n""" + """ {{- item['text'] | trim -}}\n""" + """ {%- endif -%}\n""" + """ {%- elif item['type'] == 'image' -%}\n""" + """ {{- '\\n\\n<|image|>\\n\\n' -}}\n""" + """ {%- set ns.prev_message_type = 'image' -%}\n""" + """ {%- elif item['type'] == 'audio' -%}\n""" + """ {{- '<|audio|>' -}}\n""" + """ {%- set ns.prev_message_type = 'audio' -%}\n""" + """ {%- elif item['type'] == 'video' -%}\n""" + """ {{- '\\n\\n<|video|>\\n\\n' -}}\n""" + """ {%- set ns.prev_message_type = 'video' -%}\n""" + """ {%- endif -%}\n""" + """ {%- endfor -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if not (message['tool_responses'] and not message['content']) -%}\n""" + """ {{- '\\n' -}}\n""" + """ {%- endif -%}\n""" + """{%- endfor -%}""" + ) + + new_message_loop = ( + """{#- Pre-scan: find last user message index for reasoning guard -#}\n""" + """{%- set ns_turn = namespace(last_user_idx=-1) -%}\n""" + """{%- for i in range(loop_messages | length) -%}\n""" + """ {%- if loop_messages[i]['role'] == 'user' -%}\n""" + """ {%- set ns_turn.last_user_idx = i -%}\n""" + """ {%- endif -%}\n""" + """{%- endfor -%}\n""" + """\n""" + """{#- Loop through messages -#}\n""" + """{%- for message in loop_messages -%}\n""" + """ {%- if message['role'] != 'tool' -%}\n""" + """ {%- set ns.prev_message_type = None -%}\n""" + """ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}\n""" + """ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}\n""" + """ {%- set prev_nt = namespace(role=None, found=false) -%}\n""" + """ {%- if loop.index0 > 0 -%}\n""" + """ {%- for j in range(loop.index0 - 1, -1, -1) -%}\n""" + """ {%- if not prev_nt.found -%}\n""" + """ {%- if loop_messages[j]['role'] != 'tool' -%}\n""" + """ {%- set prev_nt.role = loop_messages[j]['role'] -%}\n""" + """ {%- set prev_nt.found = true -%}\n""" + """ {%- endif -%}\n""" + """ {%- endif -%}\n""" + """ {%- endfor -%}\n""" + """ {%- endif -%}\n""" + """ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}\n""" + """ {%- if not continue_same_model_turn -%}\n""" + """ {{- '<|turn>' + role + '\\n' }}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {#- Render reasoning/reasoning_content as thinking channel -#}\n""" + """ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}\n""" + """ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}\n""" + """ {{- '<|channel>thought\\n' + thinking_text + '\\n' -}}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if message['tool_calls'] -%}\n""" + """ {%- for tool_call in message['tool_calls'] -%}\n""" + """ {%- set function = tool_call['function'] -%}\n""" + """ {{- '<|tool_call>call:' + function['name'] + '{' -}}\n""" + """ {%- if function['arguments'] is mapping -%}\n""" + """ {%- set ns_args = namespace(found_first=false) -%}\n""" + """ {%- for key, value in function['arguments'] | dictsort -%}\n""" + """ {%- if ns_args.found_first %},{% endif -%}\n""" + """ {%- set ns_args.found_first = true -%}\n""" + """ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}\n""" + """ {%- endfor -%}\n""" + """ {%- elif function['arguments'] is string -%}\n""" + """ {{- function['arguments'] -}}\n""" 
+ """ {%- endif -%}\n""" + """ {{- '}' -}}\n""" + """ {%- endfor -%}\n""" + """ {%- set ns.prev_message_type = 'tool_call' -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- set ns_tr_out = namespace(flag=false) -%}\n""" + """ {%- if message.get('tool_responses') -%}\n""" + """ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}\n""" + """ {%- for tool_response in message['tool_responses'] -%}\n""" + """ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}\n""" + """ {%- set ns_tr_out.flag = true -%}\n""" + """ {%- set ns.prev_message_type = 'tool_response' -%}\n""" + """ {%- endfor -%}\n""" + """ {%- elif message.get('tool_calls') -%}\n""" + """ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}\n""" + """ {%- set ns_tool_scan = namespace(stopped=false) -%}\n""" + """ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}\n""" + """ {%- if ns_tool_scan.stopped -%}\n""" + """ {%- elif loop_messages[k]['role'] != 'tool' -%}\n""" + """ {%- set ns_tool_scan.stopped = true -%}\n""" + """ {%- else -%}\n""" + """ {%- set follow = loop_messages[k] -%}\n""" + """ {#- Resolve tool_call_id to function name -#}\n""" + """ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}\n""" + """ {%- for tc in message['tool_calls'] -%}\n""" + """ {%- if tc.get('id') == follow.get('tool_call_id') -%}\n""" + """ {%- set ns_tname.name = tc['function']['name'] -%}\n""" + """ {%- endif -%}\n""" + """ {%- endfor -%}\n""" + """ {#- Handle content as string or content-parts array -#}\n""" + """ {%- set tool_body = follow.get('content') -%}\n""" + """ {%- if tool_body is string -%}\n""" + """ {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n""" + """ {%- elif tool_body is sequence and tool_body is not string -%}\n""" + """ {%- set ns_txt = namespace(s='') -%}\n""" + """ {%- for part in tool_body -%}\n""" + """ {%- if part.get('type') == 'text' -%}\n""" + """ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}\n""" + """ {%- endif -%}\n""" + """ {%- endfor -%}\n""" + """ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}\n""" + """ {%- else -%}\n""" + """ {{- format_tool_response_block(ns_tname.name, tool_body) -}}\n""" + """ {%- endif -%}\n""" + """ {%- set ns_tr_out.flag = true -%}\n""" + """ {%- set ns.prev_message_type = 'tool_response' -%}\n""" + """ {%- endif -%}\n""" + """ {%- endfor -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if message['content'] is string -%}\n""" + """ {%- if role == 'model' -%}\n""" + """ {{- strip_thinking(message['content']) -}}\n""" + """ {%- else -%}\n""" + """ {{- message['content'] | trim -}}\n""" + """ {%- endif -%}\n""" + """ {%- elif message['content'] is sequence -%}\n""" + """ {%- for item in message['content'] -%}\n""" + """ {%- if item['type'] == 'text' -%}\n""" + """ {%- if role == 'model' -%}\n""" + """ {{- strip_thinking(item['text']) -}}\n""" + """ {%- else -%}\n""" + """ {{- item['text'] | trim -}}\n""" + """ {%- endif -%}\n""" + """ {%- elif item['type'] == 'image' -%}\n""" + """ {{- '\\n\\n<|image|>\\n\\n' -}}\n""" + """ {%- set ns.prev_message_type = 'image' -%}\n""" + """ {%- elif item['type'] == 'audio' -%}\n""" + """ {{- '<|audio|>' -}}\n""" + """ {%- set ns.prev_message_type = 'audio' -%}\n""" + """ {%- elif item['type'] == 'video' -%}\n""" + """ {{- '\\n\\n<|video|>\\n\\n' -}}\n""" + """ {%- set ns.prev_message_type = 'video' -%}\n""" + """ {%- endif -%}\n""" + """ {%- 
endfor -%}\n""" + """ {%- endif -%}\n""" + """\n""" + """ {%- if not (ns_tr_out.flag and not message.get('content')) -%}\n""" + """ {{- '\\n' -}}\n""" + """ {%- endif -%}\n""" + """ {%- endif -%}\n""" + """{%- endfor -%}""" + ) + template = template.replace(old_message_loop, new_message_loop) + + return template + + # The correct chat templates were already uploaded to those 2 repos, so download from there _CHAT_TEMPLATE = pathlib.Path(cached_file("gg-hf-gg/gemma-4-E4B-it", "chat_template.jinja")).read_text() _CHAT_TEMPLATE_LARGE = pathlib.Path(cached_file("gg-hf-gg/gemma-4-31B-it", "chat_template.jinja")).read_text() +# Patch templates to support OpenAI-standard role: "tool" messages +_CHAT_TEMPLATE = _patch_template_for_openai_tool_role(_CHAT_TEMPLATE) +_CHAT_TEMPLATE_LARGE = _patch_template_for_openai_tool_role(_CHAT_TEMPLATE_LARGE) + + _RESPONSE_SCHEMA = { "type": "object", "properties": { @@ -377,10 +645,10 @@ def convert_audio_encoder_weights( converted_paths.append(f"{base}.ffw_layer_2.{param.removeprefix('clip_')}") converted_weights.append(matrix) elif path.endswith("ffn_layer1"): - converted_paths.append(f"{base}.ffw_layer_1.linear.weight") + converted_paths.append(f"{base}.ffw_layer_1.weight") converted_weights.append(matrix.transpose()) elif path.endswith("ffn_layer2"): - converted_paths.append(f"{base}.ffw_layer_2.linear.weight") + converted_paths.append(f"{base}.ffw_layer_2.weight") converted_weights.append(matrix.transpose()) elif path.endswith("post_layer_norm"): converted_paths.append(f"{base}.post_layer_norm.weight") @@ -398,10 +666,10 @@ def convert_audio_encoder_weights( converted_paths.append(f"{base}.ffw_layer_2.{param.removeprefix('clip_')}") converted_weights.append(matrix) elif path.endswith("ffn_layer1"): - converted_paths.append(f"{base}.ffw_layer_1.linear.weight") + converted_paths.append(f"{base}.ffw_layer_1.weight") converted_weights.append(matrix.transpose()) elif path.endswith("ffn_layer2"): - converted_paths.append(f"{base}.ffw_layer_2.linear.weight") + converted_paths.append(f"{base}.ffw_layer_2.weight") converted_weights.append(matrix.transpose()) elif path.endswith("post_layer_norm"): converted_paths.append(f"{base}.post_layer_norm.weight") @@ -428,10 +696,10 @@ def convert_audio_encoder_weights( converted_paths.append(f"{base}.depthwise_conv1d.weight") converted_weights.append(matrix.transpose()) elif path.endswith("linear_end"): - converted_paths.append(f"{base}.linear_end.linear.weight") + converted_paths.append(f"{base}.linear_end.weight") converted_weights.append(matrix.transpose()) elif path.endswith("linear_start"): - converted_paths.append(f"{base}.linear_start.linear.weight") + converted_paths.append(f"{base}.linear_start.weight") converted_weights.append(matrix.transpose()) elif path.endswith("ln"): converted_paths.append(f"{base}.pre_layer_norm.weight") @@ -457,9 +725,9 @@ def convert_audio_encoder_weights( if path.endswith("query_key_value_projection"): converted_paths.extend( [ - f"{base}.self_attn.q_proj.linear.weight", - f"{base}.self_attn.k_proj.linear.weight", - f"{base}.self_attn.v_proj.linear.weight", + f"{base}.self_attn.q_proj.weight", + f"{base}.self_attn.k_proj.weight", + f"{base}.self_attn.v_proj.weight", ] ) converted_weights.extend( @@ -472,7 +740,7 @@ def convert_audio_encoder_weights( converted_paths.append(f"{base}.self_attn.relative_k_proj.weight") converted_weights.append(matrix.reshape(config.hidden_size, config.hidden_size).transpose()) elif path.endswith("post"): - 
converted_paths.append(f"{base}.self_attn.post.linear.weight") + converted_paths.append(f"{base}.self_attn.post.weight") converted_weights.append(matrix.transpose(2, 0, 1).reshape(config.hidden_size, config.hidden_size)) elif path.endswith("post_norm"): converted_paths.append(f"{base}.norm_post_attn.weight") @@ -626,7 +894,7 @@ def convert_vision_encoder_weights( if path.endswith("attn/attn_vec_einsum"): # Shape: (12, 64, 768) -> reshape to (768, 768) for o_proj - converted_paths.append(f"{base_path}.self_attn.o_proj.linear.weight") + converted_paths.append(f"{base_path}.self_attn.o_proj.weight") converted_weights.append( matrix.transpose(2, 0, 1).reshape(config.hidden_size, config.num_attention_heads * config.head_dim) ) @@ -634,8 +902,8 @@ def convert_vision_encoder_weights( # Shape: (2, 12, 768, 64) -> split into k_proj and v_proj converted_paths.extend( [ - f"{base_path}.self_attn.k_proj.linear.weight", - f"{base_path}.self_attn.v_proj.linear.weight", + f"{base_path}.self_attn.k_proj.weight", + f"{base_path}.self_attn.v_proj.weight", ] ) k_proj_weights, v_proj_weights = matrix.transpose(0, 2, 1, 3) @@ -648,7 +916,7 @@ def convert_vision_encoder_weights( ) elif path.endswith("attn/q_einsum"): # Shape: (12, 768, 64) -> reshape to (768, 768) for q_proj - converted_paths.append(f"{base_path}.self_attn.q_proj.linear.weight") + converted_paths.append(f"{base_path}.self_attn.q_proj.weight") converted_weights.append( matrix.transpose(1, 0, 2) .reshape(config.hidden_size, config.num_attention_heads * config.head_dim) @@ -658,15 +926,15 @@ def convert_vision_encoder_weights( # Shape: (2, 3072, 768) -> split into gate_proj and up_proj converted_paths.extend( [ - f"{base_path}.mlp.gate_proj.linear.weight", - f"{base_path}.mlp.up_proj.linear.weight", + f"{base_path}.mlp.gate_proj.weight", + f"{base_path}.mlp.up_proj.weight", ] ) gate_proj_weight, up_proj_weight = matrix converted_weights.extend([gate_proj_weight, up_proj_weight]) elif path.endswith("mlp/linear"): # Shape: (3072, 768) -> transpose for down_proj - converted_paths.append(f"{base_path}.mlp.down_proj.linear.weight") + converted_paths.append(f"{base_path}.mlp.down_proj.weight") converted_weights.append(matrix.transpose()) elif path.endswith("post_attention_norm"): converted_paths.append(f"{base_path}.post_attention_layernorm.weight") @@ -1224,7 +1492,7 @@ def main(*args): pad_token_id=config.get_text_config().pad_token_id, bos_token_id=config.get_text_config().bos_token_id, eos_token_id=( - tokenizer.convert_tokens_to_ids([tokenizer.eos_token, tokenizer.eot_token, tokenizer.str_token]) + tokenizer.convert_tokens_to_ids([tokenizer.eos_token, tokenizer.eot_token]) if _INCLUDE_CHAT_TEMPLATE.value else config.get_text_config().eos_token_id ), diff --git a/src/transformers/models/gemma4/modeling_gemma4.py b/src/transformers/models/gemma4/modeling_gemma4.py index cdc4a6daeafc..ccdd41d97121 100644 --- a/src/transformers/models/gemma4/modeling_gemma4.py +++ b/src/transformers/models/gemma4/modeling_gemma4.py @@ -136,16 +136,21 @@ class Gemma4AudioModelOutput(BaseModelOutputWithPooling): attention_mask: torch.BoolTensor | None = None -class Gemma4ClippableLinear(nn.Module): +class Gemma4ClippableLinear(nn.Linear): + """Linear layer with optional input/output clamping. + + Inherits from ``nn.Linear`` directly so that PEFT/LoRA can target these + layers via ``isinstance(module, nn.Linear)``. 
+ """ + def __init__( self, config: Gemma4VisionConfig | Gemma4AudioConfig, in_features: int, out_features: int, ) -> None: - super().__init__() + super().__init__(in_features, out_features, bias=False) self.use_clipped_linears = config.use_clipped_linears - self.linear = nn.Linear(in_features, out_features, bias=False) if self.use_clipped_linears: self.register_buffer("input_min", torch.tensor(-float("inf"))) @@ -157,7 +162,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_clipped_linears: hidden_states = torch.clamp(hidden_states, self.input_min, self.input_max) - hidden_states = self.linear(hidden_states) + hidden_states = nn.Linear.forward(self, hidden_states) if self.use_clipped_linears: hidden_states = torch.clamp(hidden_states, self.output_min, self.output_max) @@ -320,7 +325,7 @@ def forward( attn_output = attn_weights @ value_states.permute(0, 3, 1, 2, 4) attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, num_blocks * self.chunk_size, -1) attn_output = attn_output[:, :seq_length].contiguous() - attn_output = self.post(attn_output.to(dtype=self.post.linear.weight.dtype)) + attn_output = self.post(attn_output.to(dtype=self.post.weight.dtype)) return attn_output, attn_weights @@ -400,7 +405,7 @@ def __init__(self, config: Gemma4AudioConfig): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # This is needed to avoid any underflow/overflow issues when clipping - gradient_clipping = min(self.gradient_clipping, torch.finfo(self.ffw_layer_1.linear.weight.dtype).max) + gradient_clipping = min(self.gradient_clipping, torch.finfo(self.ffw_layer_1.weight.dtype).max) residual = hidden_states hidden_states = torch.clamp(hidden_states, -gradient_clipping, gradient_clipping) @@ -483,7 +488,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.depthwise_conv1d(hidden_states.transpose(1, 2)).transpose(1, 2) # This is needed to avoid any underflow/overflow issues when clipping - gradient_clipping = min(self.gradient_clipping, torch.finfo(self.linear_start.linear.weight.dtype).max) + gradient_clipping = min(self.gradient_clipping, torch.finfo(self.linear_start.weight.dtype).max) hidden_states = torch.clamp(hidden_states, -gradient_clipping, gradient_clipping) hidden_states = self.conv_norm(hidden_states) @@ -1442,7 +1447,7 @@ class Gemma4PreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"] _skip_keys_device_placement = ["past_key_values", "shared_kv_states"] - _supports_flash_attn = True + _supports_flash_attn = False # released checkpoints use head_dim=512, which is not supported yet by FA kernels _supports_sdpa = True _supports_flex_attn = True @@ -1658,6 +1663,11 @@ def forward( "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), } + # Ensure a cache exists for KV sharing between layers, even when use_cache=False. + # This must happen after mask creation to avoid affecting causal mask computation. 
+ if past_key_values is None: + past_key_values = DynamicCache(config=self.config) + # embed positions hidden_states = inputs_embeds position_embeddings = {} @@ -1686,7 +1696,7 @@ def forward( return BaseModelOutputWithPast( last_hidden_state=hidden_states, - past_key_values=past_key_values, + past_key_values=past_key_values if use_cache else None, ) def get_per_layer_inputs(self, input_ids: torch.Tensor | None, inputs_embeds: torch.Tensor | None) -> torch.Tensor: @@ -1941,7 +1951,8 @@ def forward( (self.config.attention_context_left - 1, self.config.attention_context_right) ), ) - attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) + if attention_mask is not None: + attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) for encoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = encoder_layer( diff --git a/src/transformers/models/gemma4/modular_gemma4.py b/src/transformers/models/gemma4/modular_gemma4.py index 739870f2a177..a5ad1abd2580 100644 --- a/src/transformers/models/gemma4/modular_gemma4.py +++ b/src/transformers/models/gemma4/modular_gemma4.py @@ -99,16 +99,21 @@ class Gemma4AudioModelOutput(BaseModelOutputWithPooling): attention_mask: torch.BoolTensor | None = None -class Gemma4ClippableLinear(nn.Module): +class Gemma4ClippableLinear(nn.Linear): + """Linear layer with optional input/output clamping. + + Inherits from ``nn.Linear`` directly so that PEFT/LoRA can target these + layers via ``isinstance(module, nn.Linear)``. + """ + def __init__( self, config: Gemma4VisionConfig | Gemma4AudioConfig, in_features: int, out_features: int, ) -> None: - super().__init__() + super().__init__(in_features, out_features, bias=False) self.use_clipped_linears = config.use_clipped_linears - self.linear = nn.Linear(in_features, out_features, bias=False) if self.use_clipped_linears: self.register_buffer("input_min", torch.tensor(-float("inf"))) @@ -120,7 +125,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if self.use_clipped_linears: hidden_states = torch.clamp(hidden_states, self.input_min, self.input_max) - hidden_states = self.linear(hidden_states) + hidden_states = nn.Linear.forward(self, hidden_states) if self.use_clipped_linears: hidden_states = torch.clamp(hidden_states, self.output_min, self.output_max) @@ -266,7 +271,7 @@ def forward( attn_output = attn_weights @ value_states.permute(0, 3, 1, 2, 4) attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, num_blocks * self.chunk_size, -1) attn_output = attn_output[:, :seq_length].contiguous() - attn_output = self.post(attn_output.to(dtype=self.post.linear.weight.dtype)) + attn_output = self.post(attn_output.to(dtype=self.post.weight.dtype)) return attn_output, attn_weights @@ -346,7 +351,7 @@ def __init__(self, config: Gemma4AudioConfig): def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # This is needed to avoid any underflow/overflow issues when clipping - gradient_clipping = min(self.gradient_clipping, torch.finfo(self.ffw_layer_1.linear.weight.dtype).max) + gradient_clipping = min(self.gradient_clipping, torch.finfo(self.ffw_layer_1.weight.dtype).max) residual = hidden_states hidden_states = torch.clamp(hidden_states, -gradient_clipping, gradient_clipping) @@ -429,7 +434,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.depthwise_conv1d(hidden_states.transpose(1, 2)).transpose(1, 2) # This is needed to avoid any underflow/overflow issues when clipping - gradient_clipping = 
min(self.gradient_clipping, torch.finfo(self.linear_start.linear.weight.dtype).max) + gradient_clipping = min(self.gradient_clipping, torch.finfo(self.linear_start.weight.dtype).max) hidden_states = torch.clamp(hidden_states, -gradient_clipping, gradient_clipping) hidden_states = self.conv_norm(hidden_states) @@ -1158,7 +1163,9 @@ class Gemma4TextScaledWordEmbedding(Gemma3TextScaledWordEmbedding): class Gemma4PreTrainedModel(Gemma3nPreTrainedModel): _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"] + _skip_keys_device_placement = ["past_key_values", "shared_kv_states"] input_modalities = ("image", "text", "video", "audio") + _supports_flash_attn = False # released checkpoints use head_dim=512, which is not supported yet by FA kernels _can_record_outputs = None # override @torch.no_grad() @@ -1396,6 +1403,11 @@ def forward( "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), } + # Ensure a cache exists for KV sharing between layers, even when use_cache=False. + # This must happen after mask creation to avoid affecting causal mask computation. + if past_key_values is None: + past_key_values = DynamicCache(config=self.config) + # embed positions hidden_states = inputs_embeds position_embeddings = {} @@ -1424,7 +1436,7 @@ def forward( return BaseModelOutputWithPast( last_hidden_state=hidden_states, - past_key_values=past_key_values, + past_key_values=past_key_values if use_cache else None, ) @@ -1511,7 +1523,8 @@ def forward( (self.config.attention_context_left - 1, self.config.attention_context_right) ), ) - attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) + if attention_mask is not None: + attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) for encoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = encoder_layer( diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index 47c029a5bca9..66c993a94fdf 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -23,9 +23,6 @@ @strict class GptOssConfig(PreTrainedConfig): model_type = "gpt_oss" - attribute_map = { - "num_experts": "num_local_experts", - } default_theta = 150000.0 base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py index 7cf1822ee40d..26cacd39b09a 100644 --- a/src/transformers/models/lasr/feature_extraction_lasr.py +++ b/src/transformers/models/lasr/feature_extraction_lasr.py @@ -232,17 +232,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." 
) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 366e50d74ec2..5ba98d77a440 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -113,24 +113,35 @@ def __init__( } self._merges = merges or [] + # Detect whether the merges use ByteLevel encoding (Ġ markers) or + # SentencePiece (▁ markers). ByteLevel-BPE tokenizers need the + # pre_tokenizer/decoder from tokenizer.json, not the Metaspace defaults. + is_byte_level = any("Ġ" in "".join(m) for m in self._merges[:20]) + file_pre_tokenizer = kwargs.pop("pre_tokenizer", None) + file_decoder = kwargs.pop("decoder", None) self._tokenizer = Tokenizer( BPE(vocab=self._vocab, merges=self._merges, fuse_unk=True, byte_fallback=True, dropout=None) ) self._tokenizer.normalizer = None - self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace( - replacement="▁", prepend_scheme=_get_prepend_scheme(self.add_prefix_space, self), split=False - ) - - sequence = [ - decoders.Replace("▁", " "), - decoders.ByteFallback(), - decoders.Fuse(), - ] - - if self.add_prefix_space: - sequence += [decoders.Strip(content=" ", left=1)] - - self._tokenizer.decoder = decoders.Sequence(sequence) + if is_byte_level and file_pre_tokenizer is not None: + self._tokenizer.pre_tokenizer = file_pre_tokenizer + if file_decoder is not None: + self._tokenizer.decoder = file_decoder + else: + self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace( + replacement="▁", prepend_scheme=_get_prepend_scheme(self.add_prefix_space, self), split=False + ) + + sequence = [ + decoders.Replace("▁", " "), + decoders.ByteFallback(), + decoders.Fuse(), + ] + + if self.add_prefix_space: + sequence += [decoders.Strip(content=" ", left=1)] + + self._tokenizer.decoder = decoders.Sequence(sequence) self.use_default_system_prompt = use_default_system_prompt super().__init__( clean_up_tokenization_spaces=clean_up_tokenization_spaces, diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index d94c1912fbd9..2efd86398b2f 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -144,9 +144,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/mobilevit/image_processing_pil_mobilevit.py b/src/transformers/models/mobilevit/image_processing_pil_mobilevit.py index 893e27fe4ccf..f6031a740eae 100644 --- a/src/transformers/models/mobilevit/image_processing_pil_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_pil_mobilevit.py @@ -142,9 +142,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> 
np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = (image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def flip_channel_order(self, image: np.ndarray) -> np.ndarray: diff --git a/src/transformers/models/nemotron_h/modeling_nemotron_h.py b/src/transformers/models/nemotron_h/modeling_nemotron_h.py index 93bd47f2c3f4..0c59c411af88 100644 --- a/src/transformers/models/nemotron_h/modeling_nemotron_h.py +++ b/src/transformers/models/nemotron_h/modeling_nemotron_h.py @@ -974,22 +974,27 @@ def _init_weights(self, module): """Initialize the weights.""" super()._init_weights(module) if isinstance(module, NemotronHMamba2Mixer): - # Initialize A_log and D parameters - A = torch.arange(1, self.config.mamba_num_heads + 1) - init.copy_(module.A_log, torch.log(A)) - init.ones_(module.D) - - dt = torch.exp( - torch.rand(self.config.mamba_num_heads) - * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) - + math.log(self.config.time_step_min) - ).clamp(min=self.config.time_step_floor) - - # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 - inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - init.copy_(module.dt_bias, inv_dt) - module.dt_bias._no_reinit = True + # Only re-initialise params that were NOT loaded from a checkpoint. + # `_is_hf_initialized` is set by `from_pretrained` on each loaded + # parameter; without this guard a post-load safety pass of + # `_init_weights` would overwrite checkpoint values of + # A_log / D / dt_bias with fresh random draws. + if not getattr(module.A_log, "_is_hf_initialized", False): + A = torch.arange(1, self.config.mamba_num_heads + 1) + init.copy_(module.A_log, torch.log(A)) + if not getattr(module.D, "_is_hf_initialized", False): + init.ones_(module.D) + if not getattr(module.dt_bias, "_is_hf_initialized", False): + dt = torch.exp( + torch.rand(self.config.mamba_num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + init.copy_(module.dt_bias, inv_dt) elif isinstance(module, NemotronHTopkRouter): init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) init.zeros_(module.e_score_correction_bias) @@ -1014,10 +1019,12 @@ def _init_weights(self, module): # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py for name, p in module.named_parameters(): if name == "out_proj.weight": + # Skip checkpoint-loaded weights so a post-load safety + # pass of `_init_weights` doesn't silently overwrite them. 
+ if getattr(p, "_is_hf_initialized", False): + continue # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down init.kaiming_uniform_(p, a=math.sqrt(5)) with torch.no_grad(): p_new = p / math.sqrt(self.config.num_hidden_layers) diff --git a/src/transformers/models/nemotron_h/modular_nemotron_h.py b/src/transformers/models/nemotron_h/modular_nemotron_h.py index 803e5c638239..3cf46e97d097 100644 --- a/src/transformers/models/nemotron_h/modular_nemotron_h.py +++ b/src/transformers/models/nemotron_h/modular_nemotron_h.py @@ -327,22 +327,27 @@ def _init_weights(self, module): """Initialize the weights.""" super()._init_weights(module) if isinstance(module, NemotronHMamba2Mixer): - # Initialize A_log and D parameters - A = torch.arange(1, self.config.mamba_num_heads + 1) - init.copy_(module.A_log, torch.log(A)) - init.ones_(module.D) - - dt = torch.exp( - torch.rand(self.config.mamba_num_heads) - * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) - + math.log(self.config.time_step_min) - ).clamp(min=self.config.time_step_floor) - - # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 - inv_dt = dt + torch.log(-torch.expm1(-dt)) - with torch.no_grad(): - init.copy_(module.dt_bias, inv_dt) - module.dt_bias._no_reinit = True + # Only re-initialise params that were NOT loaded from a checkpoint. + # `_is_hf_initialized` is set by `from_pretrained` on each loaded + # parameter; without this guard a post-load safety pass of + # `_init_weights` would overwrite checkpoint values of + # A_log / D / dt_bias with fresh random draws. + if not getattr(module.A_log, "_is_hf_initialized", False): + A = torch.arange(1, self.config.mamba_num_heads + 1) + init.copy_(module.A_log, torch.log(A)) + if not getattr(module.D, "_is_hf_initialized", False): + init.ones_(module.D) + if not getattr(module.dt_bias, "_is_hf_initialized", False): + dt = torch.exp( + torch.rand(self.config.mamba_num_heads) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clamp(min=self.config.time_step_floor) + + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + init.copy_(module.dt_bias, inv_dt) elif isinstance(module, NemotronHTopkRouter): init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) init.zeros_(module.e_score_correction_bias) @@ -367,10 +372,12 @@ def _init_weights(self, module): # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py for name, p in module.named_parameters(): if name == "out_proj.weight": + # Skip checkpoint-loaded weights so a post-load safety + # pass of `_init_weights` doesn't silently overwrite them. 
+ if getattr(p, "_is_hf_initialized", False): + continue # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) - # We need to reinit p since this code could be called multiple times - # Having just p *= scale would repeatedly scale it down init.kaiming_uniform_(p, a=math.sqrt(5)) with torch.no_grad(): p_new = p / math.sqrt(self.config.num_hidden_layers) diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py index c745d02c9629..95289cc00d99 100644 --- a/src/transformers/models/parakeet/feature_extraction_parakeet.py +++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py @@ -217,17 +217,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py index 9ce98251e50e..3c3c1723a35a 100644 --- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py @@ -145,17 +145,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/qwen3_5/configuration_qwen3_5.py b/src/transformers/models/qwen3_5/configuration_qwen3_5.py index ae9eb8f86c6d..6548b7703ecb 100644 --- a/src/transformers/models/qwen3_5/configuration_qwen3_5.py +++ b/src/transformers/models/qwen3_5/configuration_qwen3_5.py @@ -121,6 +121,8 @@ class Qwen3_5VisionConfig(PreTrainedConfig): The output hidden size of the vision model. num_position_embeddings (`int`, *optional*, defaults to 2304): The maximum sequence length that this model might ever be used with + deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[]`): + Indexes of layers for deepstack embeddings. Defaults to empty for Qwen3.5.
""" model_type = "qwen3_5_vision" @@ -137,6 +139,8 @@ class Qwen3_5VisionConfig(PreTrainedConfig): temporal_patch_size: int | list[int] | tuple[int, int] = 2 out_hidden_size: int = 3584 num_position_embeddings: int = 2304 + + deepstack_visual_indexes: list[int] | tuple[int, ...] = () initializer_range: float = 0.02 diff --git a/src/transformers/models/qwen3_5/modular_qwen3_5.py b/src/transformers/models/qwen3_5/modular_qwen3_5.py index 710b63a28dba..38c133113017 100644 --- a/src/transformers/models/qwen3_5/modular_qwen3_5.py +++ b/src/transformers/models/qwen3_5/modular_qwen3_5.py @@ -129,9 +129,11 @@ class Qwen3_5VisionConfig(Qwen3VLVisionConfig): The output hidden size of the vision model. num_position_embeddings (`int`, *optional*, defaults to 2304): The maximum sequence length that this model might ever be used with + deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[]`): + Indexed of layers for deepstack embeddings. Defaults to empty for Qwen3.5. """ - deepstack_visual_indexes = AttributeError() + deepstack_visual_indexes: list[int] | tuple[int, ...] = () @auto_docstring(checkpoint="Qwen/Qwen3.5-27B") diff --git a/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py b/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py index f6f9594e0d73..753753a3d4de 100644 --- a/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py +++ b/src/transformers/models/qwen3_5_moe/configuration_qwen3_5_moe.py @@ -129,6 +129,8 @@ class Qwen3_5MoeVisionConfig(PreTrainedConfig): The output hidden size of the vision model. num_position_embeddings (`int`, *optional*, defaults to 2304): The maximum sequence length that this model might ever be used with + deepstack_visual_indexes (`list[int]`, *optional*, defaults to `[]`): + Indexed of layers for deepstack embeddings. Defaults to empty for Qwen3.5. """ model_type = "qwen3_5_moe_vision" @@ -145,6 +147,8 @@ class Qwen3_5MoeVisionConfig(PreTrainedConfig): temporal_patch_size: int | list[int] | tuple[int, int] = 2 out_hidden_size: int = 3584 num_position_embeddings: int = 2304 + + deepstack_visual_indexes: list[int] | tuple[int, ...] 
= () initializer_range: float = 0.02 diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 7b6c8b5b1bd4..5ab6a049efb1 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -147,9 +147,8 @@ def _get_feat_extract_output_lengths(input_lengths): Computes the output length of the convolutional layers and the output length of the audio encoder """ - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 + feat_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 return output_lengths diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 2c78ad930eba..c872e6e99752 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -119,9 +119,8 @@ def _get_feat_extract_output_lengths(input_lengths): Computes the output length of the convolutional layers and the output length of the audio encoder """ - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 + feat_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 return output_lengths diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index f8fa23ee31ba..2f58fb49b2b6 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -109,9 +109,8 @@ def _get_feat_extract_output_lengths(input_lengths): Computes the output length of the convolutional layers and the output length of the audio encoder """ - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 + feat_lengths = (input_lengths - 1) // 2 + 1 + output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 return output_lengths diff --git a/src/transformers/models/segformer/image_processing_pil_segformer.py b/src/transformers/models/segformer/image_processing_pil_segformer.py index f1d0bb0f627b..771d70a6365c 100644 --- a/src/transformers/models/segformer/image_processing_pil_segformer.py +++ b/src/transformers/models/segformer/image_processing_pil_segformer.py @@ -138,10 +138,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - # Avoid using underflow conversion - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = (image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def _preprocess( diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index efc8c312953e..616895716a3f 100644 --- 
a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -138,9 +138,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 4f4124961a92..e90af18034bc 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -96,14 +96,14 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tens # Apply Softmax and cast back to the original `dtype` router_probs = nn.functional.softmax(router_logits, dim=-1, dtype=self.dtype).to(self.input_dtype) - router_logits, expert_index = torch.max(router_probs, dim=-1, keepdim=True) + router_max_probs, expert_index = torch.max(router_probs, dim=-1, keepdim=True) expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts) token_priority = torch.cumsum(expert_index, dim=-2) # mask if the token routed to the expert will overflow expert_capacity_mask = token_priority <= self.expert_capacity expert_index = expert_index * expert_capacity_mask - router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1) - return router_probs, expert_index, router_logits + router_max_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1) + return router_max_probs, expert_index, router_logits class SwitchTransformersLayerNorm(nn.Module): diff --git a/src/transformers/models/switch_transformers/modular_switch_transformers.py b/src/transformers/models/switch_transformers/modular_switch_transformers.py index 5c0f253cfb78..eec222c16a69 100644 --- a/src/transformers/models/switch_transformers/modular_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modular_switch_transformers.py @@ -163,14 +163,14 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tens # Apply Softmax and cast back to the original `dtype` router_probs = nn.functional.softmax(router_logits, dim=-1, dtype=self.dtype).to(self.input_dtype) - router_logits, expert_index = torch.max(router_probs, dim=-1, keepdim=True) + router_max_probs, expert_index = torch.max(router_probs, dim=-1, keepdim=True) expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts) token_priority = torch.cumsum(expert_index, dim=-2) # mask if the token routed to the expert will overflow expert_capacity_mask = token_priority <= self.expert_capacity expert_index = expert_index * expert_capacity_mask - router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1) - return router_probs, expert_index, router_logits + router_max_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1) + return router_max_probs, expert_index, router_logits class SwitchTransformersLayerNorm(T5LayerNorm): diff --git 
a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py index 5757a490692a..b28b4bdf4c9d 100644 --- a/src/transformers/models/voxtral/processing_voxtral.py +++ b/src/transformers/models/voxtral/processing_voxtral.py @@ -168,6 +168,23 @@ def apply_chat_template( is_batched = False conversations = [conversation] + # Resolve chat_template if not provided + if chat_template is None: + if isinstance(self.chat_template, dict) and "default" in self.chat_template: + chat_template = self.chat_template["default"] + elif isinstance(self.chat_template, dict): + raise ValueError( + 'The processor has multiple chat templates but none of them are named "default". You need to specify' + " which one to use by passing the `chat_template` argument. Available templates are: " + f"{', '.join(self.chat_template.keys())}" + ) + elif self.chat_template is not None: + chat_template = self.chat_template + else: + raise ValueError( + "Cannot use apply_chat_template because this processor does not have a chat template." + ) + # Users might still be passing processing kwargs in `**kwargs` so we need to filter # out additional kwargs that the template expects via Jinja2 template introspection # We strip unrelated kwargs to avoid passing unrecognized kwargs to `_merge_kwargs`. diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py index 58355f3c0d7c..f13006f6b198 100644 --- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py @@ -203,17 +203,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." 
) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 1f9c9843d34a..3bc1cb4a82ab 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -1060,11 +1060,15 @@ def generate_with_fallback( new_decoder_input_ids = [] new_decoder_attention_mask = [] + eos_token_id = generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + for i, seek_sequence in enumerate(seek_sequences): # remove all padding tokens, except for the eos token if seek_sequence[-1] == generation_config.pad_token_id: num_paddings = (seek_sequence == generation_config.pad_token_id).sum() - if generation_config.pad_token_id == generation_config.eos_token_id: + if eos_token_id is not None and generation_config.pad_token_id in eos_token_id: # we do not remove the eos token id since it is needed for avg logprob calculation in _need_fallback num_paddings -= 1 if num_paddings != 0: @@ -1082,7 +1086,7 @@ def generate_with_fallback( ) # remove eos token - if seek_sequence[-1] == generation_config.eos_token_id: + if eos_token_id is not None and seek_sequence[-1].item() in eos_token_id: seek_sequence = seek_sequence[:-1] seek_sequence_list[fallback_index_map[i]] = seek_sequence diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index c0cbc7111f4b..13d4a1ab338b 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -422,7 +422,12 @@ class XCLIPPreTrainedModel(PreTrainedModel): config: XCLIPConfig base_model_prefix = "x_clip" input_modalities = ("image", "text") - _no_split_modules = ["XCLIPTextEmbeddings", "XCLIPEncoderLayer", "XCLIPVisionEmbeddings"] + _no_split_modules = [ + "XCLIPTextEmbeddings", + "XCLIPEncoderLayer", + "XCLIPVisionEmbeddings", + "XCLIPVisionEncoderLayer", + ] supports_gradient_checkpointing = True _supports_sdpa = True diff --git a/src/transformers/models/x_clip/modular_x_clip.py b/src/transformers/models/x_clip/modular_x_clip.py index 9d76e97430d1..5980e8b68e07 100644 --- a/src/transformers/models/x_clip/modular_x_clip.py +++ b/src/transformers/models/x_clip/modular_x_clip.py @@ -173,6 +173,12 @@ def forward( class XCLIPPreTrainedModel(CLIPPreTrainedModel): config: XCLIPConfig base_model_prefix = "x_clip" + _no_split_modules = [ + "XCLIPTextEmbeddings", + "XCLIPEncoderLayer", + "XCLIPVisionEmbeddings", + "XCLIPVisionEncoderLayer", + ] _can_record_outputs = { "hidden_states": [XCLIPEncoderLayer, XCLIPVisionEncoderLayer], "attentions": OutputRecorder(XCLIPAttention, layer_name="self_attn", index=1), diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index d6b9fcf32736..57ed01f99506 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -25,5 +25,12 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) self.video_processor = self.image_processor + def __call__(self, images=None, text=None, videos=None, **kwargs): + # X-CLIP uses the image_processor for video frames. 
Map videos to images + # so the base class processes them through image_processor. + if videos is not None and images is None: + images = videos + return super().__call__(images=images, text=text, **kwargs) + __all__ = ["XCLIPProcessor"] diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index 0a4fba996d7d..eaa2fdfa3a83 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -159,21 +159,23 @@ def unnormalize(bbox): else: # This is a regular ForObjectDetectionModel raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size) - raw_annotation = raw_annotations[0] - scores = raw_annotation["scores"] - labels = raw_annotation["labels"] - boxes = raw_annotation["boxes"] - - raw_annotation["scores"] = scores.tolist() - raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels] - raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes] - - # {"scores": [...], ...} --> [{"score":x, ...}, ...] - keys = ["score", "label", "box"] - annotation = [ - dict(zip(keys, vals)) - for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"]) - ] + annotation = [] + for raw_annotation in raw_annotations: + scores = raw_annotation["scores"] + labels = raw_annotation["labels"] + boxes = raw_annotation["boxes"] + + raw_annotation["scores"] = scores.tolist() + raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels] + raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes] + + keys = ["score", "label", "box"] + annotation.append( + [ + dict(zip(keys, vals)) + for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"]) + ] + ) return annotation diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index bb1344a43dcf..874a12a2039a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -22,6 +22,7 @@ import os import sys import typing +from collections import Counter from dataclasses import dataclass from pathlib import Path from typing import Annotated, Any, Literal, TypedDict, TypeVar, Union @@ -682,9 +683,8 @@ def __call__( "feature_extractor": (audio, "audio_kwargs"), } outputs = {} - for attribute_name in self.get_attributes(): + for attribute_name, (input_data, input_kwargs) in attribute_to_kwargs.items(): attribute = getattr(self, attribute_name, None) - input_data, input_kwargs = attribute_to_kwargs[attribute_name] if input_data is not None and attribute is not None: attribute_output = attribute(input_data, **kwargs[input_kwargs]) outputs.update(attribute_output) @@ -1424,11 +1424,32 @@ def from_pretrained( if token is not None: kwargs["token"] = token + prebuilt = cls._pop_prebuilt_subprocessors(kwargs) + # Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs) + args = cls._get_arguments_from_pretrained( + pretrained_model_name_or_path, processor_dict, _prebuilt=prebuilt, **kwargs + ) return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs) + @classmethod + def _pop_prebuilt_subprocessors(cls, kwargs: dict) -> dict: + """Pop pre-built sub-processors from `kwargs` 
by exact attribute name, or by modality + alias (e.g. `tokenizer=` → `bpe_tokenizer`) when that modality is unambiguous. + """ + sub_processors = cls.get_attributes() + modality_counts = Counter(_get_modality_for_attribute(s) for s in sub_processors) + prebuilt = {} + for sub_processor_type in sub_processors: + modality = _get_modality_for_attribute(sub_processor_type) + instance = kwargs.pop(sub_processor_type, None) + if instance is None and modality != sub_processor_type and modality_counts[modality] == 1: + instance = kwargs.pop(modality, None) + if instance is not None: + prebuilt[sub_processor_type] = instance + return prebuilt + @classmethod def get_attributes(cls): args_in_init = inspect.signature(cls.__init__).parameters.keys() @@ -1499,7 +1520,9 @@ def _load_tokenizer_from_pretrained( return tokenizer @classmethod - def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs): + def _get_arguments_from_pretrained( + cls, pretrained_model_name_or_path, processor_dict=None, *, _prebuilt=None, **kwargs + ): """ Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers, and feature extractors. This method inspects the processor's `__init__` signature to identify parameters @@ -1517,15 +1540,21 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor pretrained_model_name_or_path: Path or model id to load from. processor_dict: Optional dict containing processor config (from processor_config.json). Required when loading additional non-tokenizer sub-processors. + _prebuilt: Optional `{attribute: instance}` dict of pre-built sub-processors that skip loading. """ args = [] processor_dict = processor_dict if processor_dict is not None else {} # Remove subfolder from kwargs to avoid duplicate keyword arguments subfolder = kwargs.pop("subfolder", "") + prebuilt = _prebuilt or {} + # get args from processor init signature sub_processors = cls.get_attributes() for sub_processor_type in sub_processors: + if sub_processor_type in prebuilt: + args.append(prebuilt[sub_processor_type]) + continue modality = _get_modality_for_attribute(sub_processor_type) is_primary = sub_processor_type == modality @@ -1789,6 +1818,14 @@ def apply_chat_template( is_batched = False conversations = [conversation] + # Normalize: drop `content` from assistant messages when it is None. + # Some APIs (e.g. OpenAI) return content=None for tool-call-only messages, but many chat templates + # crash or produce wrong output (e.g. rendering literal "None") when they encounter it. 
+ conversations = [ + [{k: v for k, v in msg.items() if k != "content" or v is not None} for msg in conversation] + for conversation in conversations + ] + # Normalize OpenAI-style "image_url" content blocks to HuggingFace-style "image" blocks # OpenAI format: {"type": "image_url", "image_url": {"url": "..."}} # HuggingFace format: {"type": "image", "url": "..."} diff --git a/src/transformers/quantizers/quantizer_hqq.py b/src/transformers/quantizers/quantizer_hqq.py index 05dce3d996a0..43238e99e7e6 100755 --- a/src/transformers/quantizers/quantizer_hqq.py +++ b/src/transformers/quantizers/quantizer_hqq.py @@ -59,10 +59,16 @@ def __init__(self, quantization_config, **kwargs): ) super().__init__(quantization_config, **kwargs) self.dtype = None + self.device_map = None self.using_multi_gpu = False # Keys that are serialized specifically by hqq self.hqq_keys = HQQLinear(None, None).state_dict_keys() - {"bias"} + def update_dtype(self, dtype): + if dtype is not None: + self.dtype = dtype + return dtype + def validate_environment(self, *args, **kwargs): if self.dtype is None: if "dtype" in kwargs: @@ -72,6 +78,7 @@ def validate_environment(self, *args, **kwargs): logger.info("Setting dtype to torch.float32 as the default value since it was not specified.") device_map = kwargs.get("device_map") + self.device_map = device_map if isinstance(device_map, dict): if "cpu" in device_map.values() or "disk" in device_map.values(): raise ValueError( @@ -144,10 +151,16 @@ def validate_environment(self, *args, **kwargs): # return list(new_keys) def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool: - module, _ = get_module_from_name(model, param_name) - # Since we do not prepare the modules in advance, we need every param of the Linear layer to go through - # `create_quantized_param`, even when `self.is_quantized == True` - return isinstance(module, torch.nn.Linear) + module, tensor_name = get_module_from_name(model, param_name) + return isinstance(module, torch.nn.Linear) and tensor_name == "weight" + + def get_quantize_ops(self): + from ..integrations.hqq import HqqQuantize + + return HqqQuantize(self) + + def get_weight_conversions(self): + return [] # TODO: to remove # def create_quantized_param( @@ -232,6 +245,47 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** # setattr(parent_module, node, hqq_layer) + def _setup_missing_key_filters(self, model, checkpoint_files): + """Scan checkpoint files to find HQQ-quantized modules. + + For those modules: + 1. Suppress their .weight missing key warnings in the load report. + 2. Replace their weight parameter with a scalar meta tensor so that + ``_move_missing_keys_from_meta_to_device`` does not allocate + full-size fp16 tensors on GPU (which would cause OOM). 
+ """ + import re + + from safetensors import safe_open + + quantized_modules = set() + for ckpt_file in checkpoint_files: + if ckpt_file.endswith(".safetensors"): + with safe_open(ckpt_file, framework="pt") as f: + for k in f.keys(): + if k.endswith(".W_q"): + quantized_modules.add(k[: -len(".W_q")]) + else: + state_dict = torch.load(ckpt_file, map_location="cpu", weights_only=True) + for k in state_dict: + if k.endswith(".W_q"): + quantized_modules.add(k[: -len(".W_q")]) + + if quantized_modules: + # Build regex that matches only .weight keys of quantized modules + escaped = [re.escape(m) + r"\.weight" for m in quantized_modules] + existing = model._keys_to_ignore_on_load_missing or [] + model._keys_to_ignore_on_load_missing = existing + escaped + + # Replace weight params with scalar meta tensors to avoid GPU allocation + for module_name in quantized_modules: + try: + module = model.get_submodule(module_name) + except AttributeError: + continue + if hasattr(module, "weight") and module.weight is not None: + module.weight = torch.nn.Parameter(torch.empty(0, device="meta"), requires_grad=False) + def _patch_layer_for_multigpu(self, hqq_layer): def forward_with_device(self, x): out = torch.matmul(x.to(self.device), self.dequantize().t()) @@ -245,17 +299,133 @@ def forward_with_device(self, x): def _process_model_before_weight_loading( self, model: "PreTrainedModel", + checkpoint_files=None, **kwargs, ): - # Add the corresponding quant_config to each valid module. This allows us to do the actual nn.Linear -> HQQLinear conversion in create_quantized_param(). - # prepare_for_hqq_linear() also sets the right quantization config inside the model (model.config.quantization_config) and the layers (hqq_layer.quant_config) - model = prepare_for_hqq_linear(model, quantization_config=self.quantization_config) + if self.pre_quantized: + # Store checkpoint files for loading in _process_model_after_weight_loading + self._checkpoint_files = checkpoint_files + + # Suppress noisy load report: HQQ checkpoint keys (W_q, scale, etc.) are + # "unexpected" and nn.Linear .weight keys are "missing" from the standard + # loading perspective, but _load_hqq_from_checkpoint handles them. + hqq_keys = HQQLinear(None, None).state_dict_keys() + ignore_unexpected = [rf"\.{k}$" for k in hqq_keys] + existing = model._keys_to_ignore_on_load_unexpected or [] + model._keys_to_ignore_on_load_unexpected = existing + ignore_unexpected + + # For missing keys: scan checkpoint to find which modules have W_q (are HQQ-quantized), + # and suppress only their .weight keys. Also replace their weight with a scalar meta + # tensor to prevent _move_missing_keys_from_meta_to_device from allocating full-size + # tensors on GPU (which would cause OOM for large models). + self._setup_missing_key_filters(model, checkpoint_files) + else: + # Add the corresponding quant_config to each valid module for on-the-fly quantization. 
+ # prepare_for_hqq_linear() also sets the right quantization config inside the model + # (model.config.quantization_config) and the layers (hqq_layer.quant_config) + model = prepare_for_hqq_linear(model, quantization_config=self.quantization_config) def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + if self.pre_quantized: + self._load_hqq_from_checkpoint(model) setattr(model, "is_hqq_quantized", True) setattr(model, "is_hqq_serializable", self.is_serializable()) return model + def _load_hqq_from_checkpoint(self, model: "PreTrainedModel"): + """Load pre-quantized HQQ weights directly from checkpoint files.""" + from collections import defaultdict + + from safetensors import safe_open + + from ..integrations.hqq import autoname_modules, name_to_linear_tag + + # Determine target device from stored device_map + device_map = getattr(self, "device_map", None) + if isinstance(device_map, dict): + # Use the first non-cpu device from the map (values can be str, int, or torch.device) + devices = [torch.device(v) for v in device_map.values()] + cuda_devices = [d for d in devices if d.type != "cpu"] + target_device = cuda_devices[0] if cuda_devices else torch.device("cpu") + elif isinstance(device_map, str) and device_map not in ("cpu", "auto"): + target_device = torch.device(device_map) + else: + target_device = torch.device("cpu") + + autoname_modules(model) + skip_modules = self.quantization_config.skip_modules + hqq_state_dict_keys = HQQLinear(None, None).state_dict_keys() + + # Find which modules should be quantized + quantizable_modules = {} + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + linear_tag = name_to_linear_tag(name) + if linear_tag not in skip_modules: + quantizable_modules[name] = module + + # Load the full state dict from checkpoint files + full_state_dict = {} + for ckpt_file in self._checkpoint_files: + if ckpt_file.endswith(".safetensors"): + with safe_open(ckpt_file, framework="pt") as f: + for k in f.keys(): + full_state_dict[k] = f.get_tensor(k) + else: + import torch as torch_ + + full_state_dict.update(torch_.load(ckpt_file, map_location="cpu", weights_only=True)) + + # Group state dict by module + module_states = defaultdict(dict) + for key, value in full_state_dict.items(): + # Find the module this key belongs to + for module_name in quantizable_modules: + if key.startswith(module_name + "."): + param_name = key[len(module_name) + 1 :] + if param_name in hqq_state_dict_keys: + module_states[module_name][param_name] = value + break + + # Replace nn.Linear with HQQLinear for each quantizable module + for module_name, state in module_states.items(): + if "W_q" not in state: + continue + + hqq_layer = HQQLinear( + None, + None, + compute_dtype=self.dtype or torch.float16, + device="cpu", + initialize=False, + ) + + state["W_q"] = torch.nn.Parameter(state["W_q"], requires_grad=False) + hqq_layer.load_state_dict(state) + + # Move to the correct device (HQQLinear.to() is a no-op, use .cuda() instead) + if target_device.type != "cpu": + hqq_layer.cuda(target_device) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.using_multi_gpu: + hqq_layer = self._patch_layer_for_multigpu(hqq_layer) + + parent_name, _, child_name = module_name.rpartition(".") + parent = model.get_submodule(parent_name) if parent_name else model + setattr(parent, child_name, hqq_layer) + + del full_state_dict + + # Free any leftover GPU memory 
from replaced nn.Linear modules + import gc + + gc.collect() + if target_device.type != "cpu": + torch.cuda.empty_cache() + def is_serializable(self): return True diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 863242a695c6..228d0605cfd1 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -3204,26 +3204,27 @@ def get_device_properties() -> DeviceProperties: """ Get environment device properties. """ - if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: + if (IS_CUDA_SYSTEM or IS_ROCM_SYSTEM) and torch.cuda.is_available(): import torch - major, minor = torch.cuda.get_device_capability() - if IS_ROCM_SYSTEM: - return ("rocm", major, minor) - else: - return ("cuda", major, minor) - elif IS_XPU_SYSTEM: + if torch.cuda.is_available(): + major, minor = torch.cuda.get_device_capability() + if IS_ROCM_SYSTEM: + return ("rocm", major, minor) + else: + return ("cuda", major, minor) + if IS_XPU_SYSTEM: import torch - # To get more info of the architecture meaning and bit allocation, refer to https://github.com/intel/llvm/blob/sycl/sycl/include/sycl/ext/oneapi/experimental/device_architecture.def - arch = torch.xpu.get_device_capability()["architecture"] - gen_mask = 0x000000FF00000000 - gen = (arch & gen_mask) >> 32 - return ("xpu", gen, None) - elif IS_NPU_SYSTEM: + if torch.xpu.is_available(): + # To get more info of the architecture meaning and bit allocation, refer to https://github.com/intel/llvm/blob/sycl/sycl/include/sycl/ext/oneapi/experimental/device_architecture.def + arch = torch.xpu.get_device_capability()["architecture"] + gen_mask = 0x000000FF00000000 + gen = (arch & gen_mask) >> 32 + return ("xpu", gen, None) + if IS_NPU_SYSTEM: return ("npu", None, None) - else: - return (torch_device, None, None) + return (torch_device, None, None) def unpack_device_properties( @@ -3529,9 +3530,8 @@ def _prepare_debugging_info(test_info, info): """Combine the information about the test and the call information to a patched function/method within it.""" info = f"{test_info}\n\n{info}" - p = os.path.join(os.environ.get("_PATCHED_TESTING_METHODS_OUTPUT_DIR", ""), "captured_info.txt") - # TODO (ydshieh): This is not safe when we use pytest-xdist with more than 1 worker. - with open(p, "a") as fp: + output_path = _get_patched_testing_methods_output_file() + with output_path.open("a") as fp: fp.write(f"{info}\n\n{'=' * 120}\n\n") return info @@ -3754,6 +3754,27 @@ def _parse_call_info(func, args, kwargs, call_argument_expressions, target_args) return info +def _get_patched_testing_methods_output_file() -> Path: + """Return the output file used by patched assertion methods. + + Under `pytest-xdist`, workers run in separate processes but can share the same output directory. Using a worker- + specific file avoids concurrent writes and resets clobbering each other's captured debugging information. 
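+
+    For example, under `pytest -n 2` worker `gw0` writes to `captured_info_gw0.txt` and worker `gw1`
+    to `captured_info_gw1.txt`, while a run without xdist keeps the single `captured_info.txt` file
+    (illustrative names, derived from the naming scheme below).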
+ """ + + output_dir = Path(os.environ.get("_PATCHED_TESTING_METHODS_OUTPUT_DIR", "")) + worker_id = os.environ.get("PYTEST_XDIST_WORKER") + filename = f"captured_info_{worker_id}.txt" if worker_id else "captured_info.txt" + return output_dir / filename + + +def _reset_patched_testing_methods_output_file() -> Path: + """Clear the output file used by patched assertion methods and return its path.""" + + output_path = _get_patched_testing_methods_output_file() + output_path.unlink(missing_ok=True) + return output_path + + def patch_testing_methods_to_collect_info(): """ Patch some methods (`torch.testing.assert_close`, `unittest.case.TestCase.assertEqual`, etc). @@ -3761,8 +3782,7 @@ def patch_testing_methods_to_collect_info(): This will allow us to collect the call information, e.g. the argument names and values, also the literal expressions passed as the arguments. """ - p = os.path.join(os.environ.get("_PATCHED_TESTING_METHODS_OUTPUT_DIR", ""), "captured_info.txt") - Path(p).unlink(missing_ok=True) + _reset_patched_testing_methods_output_file() if is_torch_available(): import torch diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4e821dfd4e70..72f4080f7aff 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1415,6 +1415,11 @@ def _set_model_specific_special_tokens(self, special_tokens: dict[str, str | Add Args: special_tokens: Dictionary of {token_name: token_value} """ + if isinstance(special_tokens, list): + raise ValueError( + "This model's tokenizer config uses the list-based `extra_special_tokens` format " + "introduced in transformers v5. Please upgrade: pip install 'transformers>=5.0.0'" + ) self.SPECIAL_TOKENS_ATTRIBUTES = self.SPECIAL_TOKENS_ATTRIBUTES + list(special_tokens.keys()) for key, value in special_tokens.items(): if isinstance(value, (str, AddedToken)): @@ -1700,6 +1705,13 @@ def from_pretrained( else: vocab_files["vocab_file"] = match.group() + error_message = ( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing all relevant files for a {cls.__name__} tokenizer." + ) + resolved_vocab_files = {} for file_id, file_path in vocab_files.items(): if file_path is None: @@ -1728,17 +1740,19 @@ def from_pretrained( raise except Exception: # For any other exception, we throw a generic error. - raise OSError( - f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " - "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing all relevant files for a {cls.__name__} tokenizer." 
- ) + raise OSError(error_message) commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash) - for file_id, file_path in vocab_files.items(): - if file_id not in resolved_vocab_files: - continue + loadable_file_ids = set(cls.vocab_files_names) + if loadable_file_ids and "tokenizer_file" in resolved_vocab_files: + loadable_file_ids.add("tokenizer_file") + loadable_file_ids.intersection_update(resolved_vocab_files) + if ( + (local_files_only or is_local) + and loadable_file_ids + and all(resolved_vocab_files[file_id] is None for file_id in loadable_file_ids) + ): + raise OSError(error_message) return cls._from_pretrained( resolved_vocab_files, @@ -3074,6 +3088,14 @@ def apply_chat_template( conversations = [conversation] is_batched = False + # Normalize: drop `content` from assistant messages when it is None. + # Some APIs (e.g. OpenAI) return content=None for tool-call-only messages, but many chat templates + # crash or produce wrong output (e.g. rendering literal "None") when they encounter it. + conversations = [ + [{k: v for k, v in msg.items() if k != "content" or v is not None} for msg in conversation] + for conversation in conversations + ] + if continue_final_message: if add_generation_prompt: raise ValueError( diff --git a/src/transformers/tokenization_utils_tokenizers.py b/src/transformers/tokenization_utils_tokenizers.py index a700f0ad27cc..fe2119c5dda0 100644 --- a/src/transformers/tokenization_utils_tokenizers.py +++ b/src/transformers/tokenization_utils_tokenizers.py @@ -35,7 +35,7 @@ from transformers.utils.hub import cached_file -from .convert_slow_tokenizer import SpmConverter +from .convert_slow_tokenizer import SpmConverter, bytes_to_unicode from .integrations.ggml import convert_gguf_tokenizer from .modeling_gguf_pytorch_utils import load_gguf_checkpoint from .tokenization_utils_base import ( @@ -51,6 +51,7 @@ logger = logging.get_logger(__name__) +BYTE_TO_UNICODE = bytes_to_unicode() # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file TOKENIZER_FILE = "tokenizer.json" @@ -145,6 +146,8 @@ def convert_to_native_format(cls, trust_remote_code=False, **kwargs): tok_from_file = TokenizerFast.from_file(fast_tokenizer_file) local_kwargs["post_processor"] = tok_from_file.post_processor + local_kwargs["pre_tokenizer"] = tok_from_file.pre_tokenizer + local_kwargs["decoder"] = tok_from_file.decoder local_kwargs["tokenizer_padding"] = tok_from_file.padding local_kwargs["tokenizer_truncation"] = tok_from_file.truncation # Preserve truncation and padding baked into tokenizer.json so that classes @@ -337,6 +340,9 @@ def __init__(self, *args, **kwargs): tokenizer_object = kwargs.pop("tokenizer_object", None) gguf_file = kwargs.pop("gguf_file", None) fast_tokenizer_file = kwargs.pop("tokenizer_file", None) + # Pop Rust tokenizer objects before super().__init__ deepcopies kwargs. 
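+        # (These are attached by `convert_to_native_format` when a `tokenizer.json` is available; they are
+        # live Rust objects rather than plain init kwargs, so they are dropped from `kwargs` here instead of
+        # being deep-copied along with the rest.)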
+ kwargs.pop("pre_tokenizer", None) + kwargs.pop("decoder", None) # Note: added_tokens_decoder is NOT popped - it's passed to super().__init__() for processing added_tokens_decoder = kwargs.get("added_tokens_decoder", {}) # Store add_prefix_space before super().__init__() to ensure it's not overridden @@ -726,9 +732,49 @@ def _convert_id_to_token(self, index: int) -> str | None: def _add_tokens(self, new_tokens: list[str | AddedToken], special_tokens=False) -> int: if special_tokens: return self._tokenizer.add_special_tokens(new_tokens) - + new_tokens = self._maybe_encode_added_tokens_for_bytelevel(new_tokens) return self._tokenizer.add_tokens(new_tokens) + def _maybe_encode_added_tokens_for_bytelevel(self, new_tokens: list[str | AddedToken]) -> list[str | AddedToken]: + pre_tokenizer = getattr(self.backend_tokenizer, "pre_tokenizer", None) + decoder = getattr(self.backend_tokenizer, "decoder", None) + normalizer = getattr(self.backend_tokenizer, "normalizer", None) + + def _contains_bytelevel(component: Any) -> bool: + if component is None: + return False + if component.__class__.__name__ == "ByteLevel": + return True + # Some tokenizers expose wrappers like `Sequence([... ByteLevel(...) ...])`. + # We use repr-based detection as these wrappers do not consistently expose + # iterable internals in the Python bindings. + return "ByteLevel(" in repr(component) + + # Some ByteLevel tokenizers (e.g. GPT-2/Qwen families) may use ByteLevel pre-tokenizer/decoder + # without a ByteLevel normalizer. In this setup, raw unicode added tokens can decode incorrectly + # (e.g. U+010D -> '\r'). Encoding added token contents through the ByteLevel alphabet + # preserves roundtrip behavior. + if _contains_bytelevel(pre_tokenizer) and _contains_bytelevel(decoder) and not _contains_bytelevel(normalizer): + encoded_tokens: list[str | AddedToken] = [] + for token in new_tokens: + if isinstance(token, AddedToken): + encoded_content = "".join(BYTE_TO_UNICODE[b] for b in token.content.encode("utf-8")) + encoded_tokens.append( + AddedToken( + encoded_content, + single_word=token.single_word, + lstrip=token.lstrip, + rstrip=token.rstrip, + normalized=token.normalized, + special=token.special, + ) + ) + else: + encoded_tokens.append("".join(BYTE_TO_UNICODE[b] for b in token.encode("utf-8"))) + return encoded_tokens + + return new_tokens + def num_special_tokens_to_add(self, pair: bool = False) -> int: """ Returns the number of added tokens when encoding a sequence with special tokens. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f434d78d4040..db25e3c25fec 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -70,6 +70,7 @@ from .integrations.liger import apply_liger_kernel from .integrations.neftune import activate_neftune, deactivate_neftune from .integrations.peft import MIN_PEFT_VERSION +from .integrations.tensor_parallel import get_ep_sharded_param_names from .integrations.tpu import save_tpu_checkpoint, tpu_spmd_dataloader, wrap_model_xla_fsdp from .modelcard import TrainingSummary from .modeling_utils import PreTrainedModel, unwrap_model @@ -726,7 +727,12 @@ def _build_accelerator_args(self, **kwargs) -> dict[str, Any]: ) args["parallelism_config"] = self.args.parallelism_config - if getattr(self.model, "tp_size", None) is not None and self.model.tp_size > 1: + # EP-sharded params are already DTensors on the EP mesh, not on a TP mesh. 
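+        # When `has_ep` is set we therefore skip the TP-specific parallelism_config wiring below; the
+        # EP-sharded expert parameters are handled separately (see the FSDP ignored_modules logic in
+        # `create_accelerator_and_postprocess`).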
+ if ( + getattr(self.model, "tp_size", None) is not None + and self.model.tp_size > 1 + and not getattr(self.model, "has_ep", False) + ): if self.args.parallelism_config is None: if is_accelerate_available("1.12.0"): if self.args.parallelism_config is None: @@ -823,6 +829,11 @@ def create_accelerator_and_postprocess(self) -> None: # post accelerator creation setup if self.is_fsdp_enabled: fsdp_plugin = self.accelerator.state.fsdp_plugin + # EP-sharded experts must not be re-sharded by FSDP, their params are DTensors on the EP mesh. + ep_param_names = get_ep_sharded_param_names(self.model) + if ep_param_names: + module_names = list({n.rsplit(".", 1)[0] for n in ep_param_names}) + fsdp_plugin.ignored_modules = [self.model.get_submodule(n) for n in module_names] for param in ["limit_all_gathers", "activation_checkpointing"]: setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param))) if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing: @@ -2958,6 +2969,8 @@ def prediction_step( if has_labels or loss_without_labels: with self.compute_loss_context_manager(): num_items_in_batch = self._get_num_items_in_batch([inputs], self.args.device) + if self.args.use_liger_kernel and prediction_loss_only: + inputs = {**inputs, "skip_logits": True} loss, outputs = self.compute_loss( model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch ) @@ -3821,15 +3834,16 @@ def _save(self, output_dir: str | None = None, state_dict: dict | None = None) - if state_dict is None: state_dict = self.model.state_dict() - if isinstance(self.accelerator.unwrap_model(self.model, keep_torch_compile=False), supported_classes): - self.accelerator.unwrap_model(self.model, keep_torch_compile=False).save_pretrained( - output_dir, state_dict=state_dict - ) + unwrapped_model = self.accelerator.unwrap_model(self.model, keep_torch_compile=False) + if isinstance(unwrapped_model, supported_classes): + unwrapped_model.save_pretrained(output_dir, state_dict=state_dict) else: logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") safetensors.torch.save_file( state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"} ) + if hasattr(unwrapped_model, "config") and unwrapped_model.config is not None: + unwrapped_model.config.save_pretrained(output_dir) else: self.model.save_pretrained(output_dir, state_dict=state_dict) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 30377f5f5a61..fc8252a339be 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -476,6 +476,29 @@ def __call__(self, model_output, labels, shift_labels=False): return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss +def _compute_dataset_lengths(dataset, model_input_name: str) -> list[int]: + """ + Computes the lengths of the dataset items. For Hugging Face datasets, + this leverages select_columns for better performance. + """ + if not isinstance(dataset[0], (dict, BatchEncoding)) or model_input_name not in dataset[0]: + raise ValueError( + "Can only automatically infer lengths for datasets whose items are dictionaries with an " + f"'{model_input_name}' key." + ) + if hasattr(dataset, "__len__") and len(dataset) > 50000: + logger.warning( + "Computing lengths of the dataset... This may take a while. " + "To avoid this, you can provide the length of each sample in a column and set `length_column_name`." 
+ ) + + dataset_iterator = dataset + if hasattr(dataset, "select_columns"): + dataset_iterator = dataset.select_columns([model_input_name]) + + return [len(feature[model_input_name]) for feature in logging.tqdm(dataset_iterator, desc="Computing lengths")] + + def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None): """ Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar @@ -531,12 +554,7 @@ def __init__( self.batch_size = batch_size if lengths is None: model_input_name = model_input_name if model_input_name is not None else "input_ids" - if not isinstance(dataset[0], (dict, BatchEncoding)) or model_input_name not in dataset[0]: - raise ValueError( - "Can only automatically infer lengths for datasets whose items are dictionaries with an " - f"'{model_input_name}' key." - ) - lengths = [len(feature[model_input_name]) for feature in dataset] + lengths = _compute_dataset_lengths(dataset, model_input_name) elif isinstance(lengths, torch.Tensor): logger.info( "If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to list[int]..." @@ -591,12 +609,7 @@ def __init__( if lengths is None: model_input_name = model_input_name if model_input_name is not None else "input_ids" - if not isinstance(dataset[0], (dict, BatchEncoding)) or model_input_name not in dataset[0]: - raise ValueError( - "Can only automatically infer lengths for datasets whose items are dictionaries with an " - f"'{model_input_name}' key." - ) - lengths = [len(feature[model_input_name]) for feature in dataset] + lengths = _compute_dataset_lengths(dataset, model_input_name) elif isinstance(lengths, torch.Tensor): logger.info( "If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to" diff --git a/src/transformers/utils/auto_docstring.py b/src/transformers/utils/auto_docstring.py index 419579891e35..da3fc87e6c3e 100644 --- a/src/transformers/utils/auto_docstring.py +++ b/src/transformers/utils/auto_docstring.py @@ -43,6 +43,7 @@ "image_processing_pil_*.py", "image_processing_*.py", "feature_extractor_*.py", + "modular_*.py", ] PLACEHOLDER_TO_AUTO_MODULE = { @@ -4097,7 +4098,49 @@ def _process_example_section( return example_docstring -def auto_method_docstring( +class _LazyDocClass: + """ + Descriptor stored directly in ``cls.__dict__['__doc__']`` to defer class docstring + generation until the first ``cls.__doc__`` access. + + Python's ``type.__doc__`` C-level getter checks whether the stored value has a + ``__get__`` method and, if so, calls it — exactly like normal descriptor dispatch. + This lets us intercept ``cls.__doc__`` without changing the class's metaclass. + + On the first access the generator is invoked, the result is cached, and the descriptor + replaces itself with the plain string so that all subsequent lookups are zero-overhead. + """ + + def __init__(self, gen): + self._gen = gen + self._val = None + + def __get__(self, obj, cls=None): + if self._val is None: + self._val = self._gen() + # Replace ourselves with the plain string so future accesses skip this + # descriptor entirely. + if cls is not None: + try: + type.__setattr__(cls, "__doc__", self._val) + except (TypeError, AttributeError): + pass + return self._val + + +def _apply_lazy_doc(cls, doc_generator): + """ + Store a lazy docstring generator on *cls*. + + Sets ``cls.__doc__`` to a :class:`_LazyDocClass` descriptor. 
Python's + ``type.__doc__`` C getter calls ``__get__`` on any descriptor it finds in the class + dict, so the generator is invoked transparently on first ``cls.__doc__`` access + without requiring any metaclass change. + """ + cls.__doc__ = _LazyDocClass(doc_generator) + + +def _generate_method_docstring( func, parent_class=None, custom_intro=None, @@ -4107,16 +4150,22 @@ def auto_method_docstring( allowed_params=None, ): """ - Wrapper that automatically generates docstring. + Pure helper that builds and returns the docstring string for *func*. + + Unlike ``auto_method_docstring`` this function does **not** modify ``func`` and does + not return a wrapper — it simply returns the generated docstring as a ``str``. """ + # Use the raw (unwrapped) function so we get the source-code docstring, not a + # previously auto-generated one. + raw_func = getattr(func, "__wrapped__", func) # Use inspect to retrieve the method's signature - sig = inspect.signature(func) - indent_level = get_indent_level(func) if not parent_class else get_indent_level(parent_class) + sig = inspect.signature(raw_func) + indent_level = get_indent_level(raw_func) if not parent_class else get_indent_level(parent_class) # Get model information - model_name_lowercase, class_name, config_class = _get_model_info(func, parent_class) - func_documentation = func.__doc__ + model_name_lowercase, class_name, config_class = _get_model_info(raw_func, parent_class) + func_documentation = raw_func.__doc__ if custom_args is not None and func_documentation is not None: func_documentation = "\n" + set_min_indent(custom_args.strip("\n"), 0) + "\n" + func_documentation @@ -4129,13 +4178,13 @@ def auto_method_docstring( if not docstring.strip().endswith("\n"): docstring += "\n" else: - docstring = add_intro_docstring(func, class_name=class_name, indent_level=indent_level) + docstring = add_intro_docstring(raw_func, class_name=class_name, indent_level=indent_level) # Process Parameters section docstring += _process_parameters_section( func_documentation, sig, - func, + raw_func, class_name, model_name_lowercase, parent_class, @@ -4153,7 +4202,7 @@ def auto_method_docstring( # Process Example section example_docstring = _process_example_section( func_documentation, - func, + raw_func, parent_class, class_name, model_name_lowercase, @@ -4166,14 +4215,49 @@ def auto_method_docstring( # Format the docstring with the placeholders docstring = format_args_docstring(docstring, model_name_lowercase) - # Assign the dynamically generated docstring to the wrapper function - func.__doc__ = docstring + return docstring + + +def auto_method_docstring( + func, + parent_class=None, + custom_intro=None, + custom_args=None, + checkpoint=None, + source_args_dict=None, + allowed_params=None, +): + """ + Wrapper that automatically generates a method docstring. + + Methods must remain plain functions so that ``torch.compile`` / ``torch._dynamo`` + can trace them without obstruction. We therefore generate the docstring eagerly + and assign it directly to ``func.__doc__``, returning the original function + unchanged. (Class-level docstrings use :class:`_LazyDocClass` instead and are + generated lazily on first ``cls.__doc__`` access.) 
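+
+    Illustrative usage (this is the path taken when ``@auto_docstring`` decorates a function rather than a
+    class)::
+
+        @auto_docstring
+        def forward(self, input_ids, attention_mask=None):
+            ...
+
+    The function object comes back unchanged, with only ``forward.__doc__`` filled in eagerly.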
+ """ + func.__doc__ = _generate_method_docstring( + func, + parent_class=parent_class, + custom_intro=custom_intro, + custom_args=custom_args, + checkpoint=checkpoint, + source_args_dict=source_args_dict, + allowed_params=allowed_params, + ) return func -def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=None): +def _generate_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=None, _original_doc=None): """ - Wrapper that automatically generates a docstring for classes based on their attributes and methods. + Pure helper that builds and returns the docstring string for *cls*. + + Unlike ``auto_class_docstring`` this function does **not** modify *cls* and does not + return a wrapper — it simply returns the generated docstring as a ``str``. + + *_original_doc* must be the raw source-code docstring captured **before** lazy setup so + that this function never calls ``cls.__doc__`` (which would recurse into the lazy + machinery). """ # import here to avoid circular import from transformers.models import auto as auto_module @@ -4185,43 +4269,43 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No docstring_init = "" docstring_args = "" if "PreTrainedModel" in (x.__name__ for x in cls.__mro__): - docstring_init = auto_method_docstring( + docstring_init = _generate_method_docstring( cls.__init__, parent_class=cls, custom_args=custom_args, checkpoint=checkpoint - ).__doc__.replace("Args:", "Parameters:") + ).replace("Args:", "Parameters:") elif "ProcessorMixin" in (x.__name__ for x in cls.__mro__): is_processor = True - docstring_init = auto_method_docstring( + docstring_init = _generate_method_docstring( cls.__init__, parent_class=cls, custom_args=custom_args, checkpoint=checkpoint, source_args_dict=get_args_doc_from_source([ModelArgs, ImageProcessorArgs, ProcessorArgs]), - ).__doc__.replace("Args:", "Parameters:") + ).replace("Args:", "Parameters:") elif "ModelOutput" in (x.__name__ for x in cls.__mro__): # We have a data class is_dataclass = True - doc_class = cls.__doc__ + doc_class = _original_doc if custom_args is None and doc_class: custom_args = doc_class - docstring_args = auto_method_docstring( + docstring_args = _generate_method_docstring( cls.__init__, parent_class=cls, custom_args=custom_args, checkpoint=checkpoint, source_args_dict=get_args_doc_from_source(ModelOutputArgs), - ).__doc__ + ) elif any("BaseImageProcessor" in x.__name__ for x in cls.__mro__): is_image_processor = True - docstring_init = auto_method_docstring( + docstring_init = _generate_method_docstring( cls.__init__, parent_class=cls, custom_args=custom_args, checkpoint=checkpoint, source_args_dict=get_args_doc_from_source(ImageProcessorArgs), - ).__doc__ + ) elif "PreTrainedConfig" in (x.__name__ for x in cls.__mro__): is_config = True - doc_class = cls.__doc__ + doc_class = _original_doc if custom_args is None and doc_class: custom_args = doc_class @@ -4237,14 +4321,14 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No k for k, v in getattr(ancestor, "__annotations__", {}).items() if get_origin(v) is not ClassVar } allowed_params = own_config_params if own_config_params else None - docstring_init = auto_method_docstring( + docstring_init = _generate_method_docstring( cls.__init__, parent_class=cls, custom_args=custom_args, checkpoint=checkpoint, source_args_dict=get_args_doc_from_source([ConfigArgs]), allowed_params=allowed_params, - ).__doc__ + ) indent_level = get_indent_level(cls) model_name_lowercase = 
get_model_name(cls) @@ -4310,7 +4394,8 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No # No init function, we have a data class docstring += docstring_args if docstring_args else "\nArgs:\n" source_args_dict = get_args_doc_from_source(ModelOutputArgs) - doc_class = cls.__doc__ if cls.__doc__ else "" + # Use the captured raw docstring to avoid recursing into the lazy machinery. + doc_class = _original_doc if _original_doc else "" documented_kwargs = parse_docstring(doc_class)[0] for param_name, param_type_annotation in cls.__annotations__.items(): param_type, optional = process_type_annotation(param_type_annotation, param_name) @@ -4348,9 +4433,32 @@ def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=No print( f"You used `@auto_class_docstring` decorator on `{cls.__name__}` but this class is not part of the AutoMappings. Remove the decorator" ) - # Assign the dynamically generated docstring to the wrapper class - cls.__doc__ = docstring + docstring = "" + return docstring + + +def auto_class_docstring(cls, custom_intro=None, custom_args=None, checkpoint=None): + """ + Wrapper that automatically generates a docstring for classes lazily. + + Stores a generator on *cls* that produces the full docstring on first ``cls.__doc__`` + access rather than at decoration / import time. + """ + # Capture the raw source-code docstring **before** any lazy machinery is attached so + # that the generator closure can use it safely without risking re-entry. + original_doc = cls.__dict__.get("__doc__") + + def _generator(): + return _generate_class_docstring( + cls, + custom_intro=custom_intro, + custom_args=custom_args, + checkpoint=checkpoint, + _original_doc=original_doc, + ) + + _apply_lazy_doc(cls, _generator) return cls @@ -4363,6 +4471,18 @@ def auto_docstring(obj=None, *, custom_intro=None, custom_args=None, checkpoint= for common arguments (like `input_ids`, `attention_mask`, etc.), and generates complete documentation including examples and return value descriptions. + **Lazy generation for classes** — class docstrings are generated on the *first* access of ``cls.__doc__``, + not at decoration / import time. This means the cost is paid only when documentation is actually needed + (e.g. when Sphinx builds the docs or ``help()`` is called), keeping import times fast. + + - For **classes** the decorator stores a :class:`_LazyDocClass` descriptor in ``cls.__dict__['__doc__']``. + Python's ``type.__doc__`` C getter calls ``__get__`` on that descriptor transparently; no metaclass change + is required. After the first access the descriptor replaces itself with the plain generated string so + subsequent accesses are zero-overhead. + - For **methods / functions** the docstring is generated eagerly at decoration time and assigned directly + to ``func.__doc__``. The function itself is returned unchanged, ensuring full compatibility with + ``torch.compile`` / ``torch._dynamo`` and ``inspect.signature``. + For complete documentation and examples, read this [guide](https://huggingface.co/docs/transformers/auto_docstring). Examples of usage: @@ -4499,6 +4619,8 @@ class MyModelOutput(ImageClassifierOutput): - For model classes, the decorator derives parameter descriptions from the `__init__` method's signature and docstring. - Return value documentation is automatically generated for methods that return ModelOutput subclasses. 
+ - Decorated methods remain plain functions (``inspect.isfunction`` returns ``True``) and are fully + compatible with ``torch.compile`` / ``torch._dynamo``. """ def auto_docstring_decorator(obj): diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index de11d23cbecf..c46367c4e55b 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -948,7 +948,7 @@ def is_flash_attn_2_available() -> bool: is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True) # FA4 is also distributed under "flash_attn", hence we need to check the naming here is_available = is_available and "flash-attn" in [ - pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] + pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING.get("flash_attn", []) ] if not is_available or not (is_torch_cuda_available() or is_torch_mlu_available()): @@ -964,10 +964,10 @@ def is_flash_attn_2_available() -> bool: @lru_cache def is_flash_attn_3_available() -> bool: # Universally available under `flash_attn_interface` - is_available = _is_package_available("flash_attn_interface")[0] + is_available = _is_package_available("flash_attn")[0] # Resolving and ensuring the proper name of FA3 being associated is_available = is_available and "flash-attn-3" in [ - pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING["flash_attn_interface"] + pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING.get("flash_attn", []) ] return is_available and is_torch_cuda_available() @@ -979,7 +979,7 @@ def is_flash_attn_4_available() -> bool: # NOTE: FA2 seems to distribute the `cute` subdirectory even if only FA2 has been installed # -> check for the proper (normalized) distribution name is_available = is_available and "flash-attn-4" in [ - pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] + pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING.get("flash_attn", []) ] return is_available and is_torch_cuda_available() @@ -990,7 +990,7 @@ def is_flash_attn_greater_or_equal(library_version: str) -> bool: is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True) # FA4 is also distributed under "flash_attn", hence we need to check the naming here is_available = is_available and "flash-attn" in [ - pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] + pkg.replace("_", "-") for pkg in PACKAGE_DISTRIBUTION_MAPPING.get("flash_attn", []) ] if not is_available: @@ -2582,13 +2582,20 @@ def wrapper(*args, **kwargs): BASE_FILE_REQUIREMENTS = { lambda name, content: "modeling_" in name: ("torch",), lambda name, content: "tokenization_" in name and name.endswith("_fast"): ("tokenizers",), - lambda name, content: "image_processing_" in name and "TorchvisionBackend" in content: ( + lambda name, content: ( + "image_processing_" in name and "TorchvisionBackend" in content and "image_processing_pil_" not in name + ): ( "vision", "torch", "torchvision", ), lambda name, content: "image_processing_" in name: ("vision",), - lambda name, content: "video_processing_" in name: ("vision", "torch", "torchvision"), + lambda name, content: "video_processing_" in name and "video_processing_pil_" not in name: ( + "vision", + "torch", + "torchvision", + ), + lambda name, content: "video_processing_pil_" in name: ("vision", "torch"), } @@ -2634,6 +2641,13 @@ def fetch__all__(file_content) -> list[str]: return _all +def 
_normalize_pil_backends(module_name: str, backends: tuple[str, ...]) -> tuple[str, ...]: + # PIL-specific processors should not require torchvision. + if "image_processing_pil_" in module_name or "video_processing_pil_" in module_name: + return tuple(backend for backend in backends if backend != "torchvision") + return backends + + @lru_cache def create_import_structure_from_path(module_path): """ @@ -2797,7 +2811,8 @@ def create_import_structure_from_path(module_path): else: backends = () - backends = frozenset(backends + base_requirements) + backends = _normalize_pil_backends(module_name, backends + base_requirements) + backends = frozenset(backends) if backends not in module_requirements: module_requirements[backends] = {} if module_name not in module_requirements[backends]: diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index bf085d87498c..41322347c7ea 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -302,9 +302,11 @@ def __init__( view_as_float: bool = False, axis: int | None = None, dynamic_config: dict | None = None, - skip_modules: list[str] = ["lm_head"], + skip_modules: list[str] | None = None, **kwargs, ): + if skip_modules is None: + skip_modules = ["lm_head"] if is_hqq_available(): from hqq.core.quantize import BaseQuantizeConfig as HQQBaseQuantizeConfig else: @@ -946,13 +948,19 @@ def __init__( in_features: int = -1, indices_as_float: bool = False, is_indice_packed: bool = True, - num_centroids: list = [-1, -1], - num_res_centroids: list = [-1, -1], + num_centroids: list | None = None, + num_res_centroids: list | None = None, out_features: int = -1, outlier_size: int = 0, - vector_lens: list = [-1, -1], + vector_lens: list | None = None, **kwargs, ): + if num_centroids is None: + num_centroids = [-1, -1] + if num_res_centroids is None: + num_res_centroids = [-1, -1] + if vector_lens is None: + vector_lens = [-1, -1] self.enable_norm = enable_norm self.enable_perm = enable_perm self.group_num = group_num @@ -994,11 +1002,15 @@ class VptqConfig(QuantizationConfigMixin): def __init__( self, enable_proxy_error: bool = False, - config_for_layers: dict[str, Any] = {}, - shared_layer_config: dict[str, Any] = {}, + config_for_layers: dict[str, Any] | None = None, + shared_layer_config: dict[str, Any] | None = None, modules_to_not_convert: list | None = None, **kwargs, ): + if config_for_layers is None: + config_for_layers = {} + if shared_layer_config is None: + shared_layer_config = {} self.quant_method = QuantizationMethod.VPTQ self.enable_proxy_error = enable_proxy_error self.config_for_layers: dict[str, Any] = config_for_layers @@ -1912,9 +1924,11 @@ def __init__( weight_scale_2d: bool = False, weight_scale_rule: str | None = None, module_config_overrides: dict[str, dict[str, Any]] | None = None, - modules_to_not_convert: list[str] | None = ["lm_head"], + modules_to_not_convert: list[str] | None = None, **kwargs, ): + if modules_to_not_convert is None: + modules_to_not_convert = ["lm_head"] self.quant_method = QuantizationMethod.FOUR_OVER_SIX self.activation_dtype = activation_dtype diff --git a/src/transformers/utils/type_validators.py b/src/transformers/utils/type_validators.py index 08d4697683b2..0fe4a4e9eed4 100644 --- a/src/transformers/utils/type_validators.py +++ b/src/transformers/utils/type_validators.py @@ -132,6 +132,18 @@ def tensor_type_validator(value: str | TensorType | None = None): raise ValueError(f"The tensor type should be one of 
{possible_names} but got tensor_type={value}") +@as_validated_field +def dtype_validator(value: str | int | None = None): + # Check all possible values + if value is None or (is_torch_available() and isinstance(value, torch.dtype)) or isinstance(value, str): + pass + # If torch not installed in env, just pass + elif not is_torch_available(): + pass + else: + raise ValueError(f"Dtype must be either an string or `torch.dtype`, but got dtype={value}") + + @as_validated_field def label_to_id_validation(value: str | TensorType | None = None): possible_names = ["pt", "np", "mlx"] diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/benchmarks/conftest.py b/tests/benchmarks/conftest.py new file mode 100644 index 000000000000..521e8f1c9db5 --- /dev/null +++ b/tests/benchmarks/conftest.py @@ -0,0 +1,15 @@ +""" +Conftest for benchmarks: provide a no-op ``benchmark`` fixture so that benchmark +tests are skipped (rather than erroring) when ``pytest-benchmark`` is not installed. +""" + +import pytest + + +try: + import pytest_benchmark # noqa: F401 +except ImportError: + # Provide a stub fixture that skips gracefully. + @pytest.fixture + def benchmark(request): + pytest.skip("pytest-benchmark not installed (pip install pytest-benchmark)") diff --git a/tests/benchmarks/test_lazy_docstring_benchmarks.py b/tests/benchmarks/test_lazy_docstring_benchmarks.py new file mode 100644 index 000000000000..6fa3709c92d9 --- /dev/null +++ b/tests/benchmarks/test_lazy_docstring_benchmarks.py @@ -0,0 +1,167 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Benchmarks for the lazy-docstring machinery introduced in ``auto_docstring.py``. + +Run with:: + + pip install pytest-benchmark + pytest tests/benchmarks/test_lazy_docstring_benchmarks.py -v --benchmark-only + +These benchmarks are **informational** — they assert nothing about absolute +thresholds. Use them to compare before/after performance of ``auto_docstring`` +changes, or to spot regressions in import / doc-access paths. +""" + +import importlib +import sys + +import pytest + + +try: + import pytest_benchmark # noqa: F401 + + HAS_BENCHMARK = True +except ImportError: + HAS_BENCHMARK = False + +pytestmark = pytest.mark.skipif( + not HAS_BENCHMARK, reason="pytest-benchmark not installed (pip install pytest-benchmark)" +) + + +# --------------------------------------------------------------------------- +# 1. Module import time +# --------------------------------------------------------------------------- + + +def _do_import_image_processing(): + """Re-import ``image_processing_utils`` from scratch each round.""" + sys.modules.pop("transformers.image_processing_utils", None) + importlib.import_module("transformers.image_processing_utils") + + +@pytest.mark.benchmark(group="import") +def test_import_image_processing(benchmark): + """Measure how long it takes to import ``transformers.image_processing_utils``. 
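+
+    Each round re-imports only this module (``_do_import_image_processing`` pops it from ``sys.modules``
+    first); its dependencies stay cached from the warm-up below, so the timing isolates the target module's
+    own import cost.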
+ + A significant portion of this time used to be docstring generation; with the + lazy approach that cost is deferred until ``__doc__`` is first accessed. + """ + # Warm-up: ensure everything except the target module is already cached. + import transformers.image_processing_utils # noqa: F401 + + benchmark(_do_import_image_processing) + + +# --------------------------------------------------------------------------- +# 2. Class ``__doc__`` access — first (generates) vs cached +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="doc_access") +def test_class_doc_first_access(benchmark): + """Measure the cost of the *first* ``cls.__doc__`` access (triggers generation). + + Because ``_LazyDocClass.__get__`` replaces itself with a plain string after the + first call, subsequent benchmarks in this process will measure the cached path. + Run with ``--benchmark-disable-gc`` for reproducible timings. + """ + from transformers.image_processing_utils import BaseImageProcessor + + # Reset the lazy state so every round re-generates. + from transformers.utils.auto_docstring import auto_class_docstring + + def setup(): + auto_class_docstring(BaseImageProcessor) + + def access(): + return BaseImageProcessor.__doc__ + + benchmark.pedantic(access, setup=setup, rounds=10, iterations=1) + + +@pytest.mark.benchmark(group="doc_access") +def test_class_doc_cached_access(benchmark): + """Measure the cost of accessing ``cls.__doc__`` after it has been generated. + + After the first access the lazy descriptor replaces itself with a plain string, + so this path should be essentially free. + """ + from transformers.image_processing_utils import BaseImageProcessor + + # Ensure doc is already generated (cached). + _ = BaseImageProcessor.__doc__ + + benchmark(lambda: BaseImageProcessor.__doc__) + + +# --------------------------------------------------------------------------- +# 3. Method ``__doc__`` access +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="doc_access") +def test_method_doc_access(benchmark): + """Measure ``method.__doc__`` access cost after eager decoration. + + Methods are decorated eagerly (``func.__doc__`` is set at decoration time and + the original function is returned unchanged). Subsequent reads are a plain + attribute lookup — essentially free. + """ + from transformers.utils.auto_docstring import auto_method_docstring + + def _dummy(x: int, y: int = 0) -> int: + r"""x (`int`): First number.\ny (`int`, *optional*): Second number.""" + return x + y + + _dummy.__qualname__ = "DummyClass.forward" # appear as a method to auto_method_docstring + auto_method_docstring(_dummy) + + benchmark(lambda: _dummy.__doc__) + + +# --------------------------------------------------------------------------- +# 4. ``from_pretrained`` with a tiny model (end-to-end smoke benchmark) +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark(group="from_pretrained") +@pytest.mark.slow +def test_from_pretrained_tiny_llama(benchmark): + """Measure ``LlamaForCausalLM.from_pretrained`` on a tiny random model. + + This is a *slow* benchmark (marked with ``@pytest.mark.slow``) that requires + network access and PyTorch. It is skipped by default unless ``RUN_SLOW=1`` + is set. 
Run with:: + + RUN_SLOW=1 pytest tests/benchmarks/test_lazy_docstring_benchmarks.py \ + -k test_from_pretrained_tiny_llama -v --benchmark-only + """ + import os + + if not os.environ.get("RUN_SLOW"): + pytest.skip("Set RUN_SLOW=1 to run this benchmark") + + try: + from transformers import LlamaForCausalLM + except ImportError: + pytest.skip("PyTorch is required for this benchmark") + + benchmark( + LlamaForCausalLM.from_pretrained, + "hf-internal-testing/tiny-random-LlamaForCausalLM", + low_cpu_mem_usage=False, + ) diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py index 83f170a4d555..c4b5636a618c 100644 --- a/tests/generation/test_logits_process.py +++ b/tests/generation/test_logits_process.py @@ -624,6 +624,11 @@ def test_eta_dist_warper(self): # first batch should keep 2 tokens, second batch would keep only 1, but due to `min_tokens_to_keep=2` keeps 2. self.assertListEqual((filtered_dist != 0.0).to(torch.long).sum(dim=-1).tolist(), [2, 2]) + # eta warper should fail fast when a previous processor fully masked a row. + fully_masked_scores = torch.full((1, vocab_size), -float("inf"), device=torch_device, dtype=torch.float) + with self.assertRaisesRegex(ValueError, "all logits set to -inf"): + eta_warp(input_ids, fully_masked_scores) + def test_no_repeat_ngram_dist_processor(self): vocab_size = 3 batch_size = 2 diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 15df7036eb35..f272b7c344c8 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2893,6 +2893,35 @@ def emit(self, record): finally: logger.removeHandler(warningHandler) + def test_inputs_embeds_warn_without_ids_for_token_based_processors(self): + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device).eval() + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + inputs = tokenizer("Hello world", return_tensors="pt").to(torch_device) + embeds = model.get_input_embeddings()(inputs["input_ids"]) + + outputs_without_penalty = model.generate(inputs_embeds=embeds, max_new_tokens=5, repetition_penalty=1.0) + self.assertEqual(outputs_without_penalty.shape[0], inputs["input_ids"].shape[0]) + + with self.assertWarnsRegex(UserWarning, "repetition_penalty"): + outputs_with_ignored_penalty = model.generate( + inputs_embeds=embeds, max_new_tokens=5, repetition_penalty=1.1 + ) + self.assertEqual(outputs_with_ignored_penalty.shape[0], inputs["input_ids"].shape[0]) + + with self.assertWarnsRegex(UserWarning, "no_repeat_ngram_size"): + outputs_with_ignored_ngram = model.generate(inputs_embeds=embeds, max_new_tokens=5, no_repeat_ngram_size=2) + self.assertEqual(outputs_with_ignored_ngram.shape[0], inputs["input_ids"].shape[0]) + + outputs = model.generate( + input_ids=inputs["input_ids"], + inputs_embeds=embeds, + attention_mask=inputs.get("attention_mask"), + max_new_tokens=5, + repetition_penalty=1.1, + no_repeat_ngram_size=2, + ) + self.assertEqual(outputs.shape[0], inputs["input_ids"].shape[0]) + @slow def test_beam_search_early_stop_heuristic(self): """Regression test for #38778 (early stopping needs to be tracked at a batch level)""" diff --git a/tests/models/auto/test_image_processing_auto.py b/tests/models/auto/test_image_processing_auto.py index 886292830678..ec243b07cc48 100644 --- a/tests/models/auto/test_image_processing_auto.py +++ b/tests/models/auto/test_image_processing_auto.py @@ -18,6 +18,7 @@ import tempfile import unittest from pathlib import Path 
+from unittest.mock import patch import transformers from transformers import ( @@ -291,6 +292,17 @@ def test_backend_kwarg_pil(self): image_processor = AutoImageProcessor.from_pretrained(tmpdirname, backend="pil") self.assertIsInstance(image_processor, ViTImageProcessorPil) + @require_vision + def test_auto_backend_falls_back_to_pil_when_torchvision_is_unavailable(self): + with tempfile.TemporaryDirectory() as tmpdirname: + processor_tmpfile = Path(tmpdirname) / "preprocessor_config.json" + json.dump({"image_processor_type": "Gemma3ImageProcessor"}, open(processor_tmpfile, "w")) + + with patch("transformers.models.auto.image_processing_auto.is_torchvision_available", return_value=False): + image_processor = AutoImageProcessor.from_pretrained(tmpdirname) + + self.assertEqual(type(image_processor).__name__, "Gemma3ImageProcessorPil") + @require_torchvision def test_backend_kwarg_torchvision(self): with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index c029ae2cf97d..a8185b55597a 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -498,6 +498,46 @@ def __init__(self, tokenizer, decoder_tokenizer, image_processor): # Verify image processor loaded correctly self.assertEqual(loaded_processor.image_processor.size, image_processor.size) + def test_processor_from_pretrained_with_prebuilt_tokenizer_kwarg(self): + class SingleTokenizerProcessor(ProcessorMixin): + def __init__(self, bpe_tokenizer): + super().__init__(bpe_tokenizer) + + class DualTokenizerProcessor(ProcessorMixin): + def __init__(self, bpe_tokenizer, decoder_tokenizer): + super().__init__(bpe_tokenizer, decoder_tokenizer) + + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM") + + self.assertEqual( + SingleTokenizerProcessor._pop_prebuilt_subprocessors({"tokenizer": tokenizer}), + {"bpe_tokenizer": tokenizer}, + ) + ambiguous_kwargs = {"tokenizer": tokenizer} + self.assertEqual(DualTokenizerProcessor._pop_prebuilt_subprocessors(ambiguous_kwargs), {}) + self.assertIn("tokenizer", ambiguous_kwargs) + + with tempfile.TemporaryDirectory() as tmp_dir: + SingleTokenizerProcessor(bpe_tokenizer=tokenizer).save_pretrained(tmp_dir) + + loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, bpe_tokenizer=tokenizer) + self.assertIs(loaded.bpe_tokenizer, tokenizer) + + loaded = SingleTokenizerProcessor.from_pretrained(tmp_dir, tokenizer=tokenizer) + self.assertIs(loaded.bpe_tokenizer, tokenizer) + + loaded, unused = SingleTokenizerProcessor.from_pretrained( + tmp_dir, tokenizer=tokenizer, return_unused_kwargs=True + ) + self.assertIs(loaded.bpe_tokenizer, tokenizer) + self.assertNotIn("tokenizer", unused) + + loaded, unused = SingleTokenizerProcessor.from_pretrained( + tmp_dir, bpe_tokenizer=tokenizer, return_unused_kwargs=True + ) + self.assertIs(loaded.bpe_tokenizer, tokenizer) + self.assertNotIn("bpe_tokenizer", unused) + def test_processor_with_multiple_image_processors_save_load(self): """Test that processors with multiple image processors save and load correctly.""" diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py index 5e584a55b21f..2330f0eb3d9d 100644 --- a/tests/models/auto/test_tokenization_auto.py +++ b/tests/models/auto/test_tokenization_auto.py @@ -45,7 +45,6 @@ from transformers.models.auto.configuration_auto import CONFIG_MAPPING, AutoConfig from transformers.models.auto.tokenization_auto 
import ( REGISTERED_FAST_ALIASES, - REGISTERED_TOKENIZER_CLASSES, TOKENIZER_MAPPING, TOKENIZER_MAPPING_NAMES, get_tokenizer_config, @@ -337,6 +336,27 @@ def test_auto_tokenizer_from_mistral_patching(self): "mistralai/Ministral-3-3B-Instruct-2512", fix_mistral_regex=True ) # should not error + @require_tokenizers + def test_auto_tokenizer_mistral_patching_applies_pretokenizer(self): + """Verify fix_mistral_regex=True actually patches the pre_tokenizer without AttributeError.""" + import tokenizers + + tokenizer = AutoTokenizer.from_pretrained("mistralai/Ministral-3-3B-Instruct-2512") + # Create a temp config with an old transformers_version so the patching code path is exercised + with tempfile.TemporaryDirectory() as tmp_dir: + config_path = os.path.join(tmp_dir, "config.json") + with open(config_path, "w", encoding="utf-8") as f: + json.dump({"model_type": "mistral", "transformers_version": "4.50.0"}, f) + + patched = TokenizersBackend._patch_mistral_regex( + tokenizer._tokenizer, + tmp_dir, + is_local=True, + fix_mistral_regex=True, + ) + self.assertTrue(getattr(patched, "fix_mistral_regex", False)) + self.assertIsInstance(patched.pre_tokenizer, tokenizers.pre_tokenizers.Sequence) + @require_tokenizers def test_auto_tokenizer_loads_bloom_repo_without_tokenizer_class(self): tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM") @@ -395,7 +415,6 @@ def test_new_tokenizer_registration(self): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None) @require_tokenizers def test_new_tokenizer_fast_registration(self): @@ -440,8 +459,6 @@ def test_new_tokenizer_fast_registration(self): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizer", None) - REGISTERED_TOKENIZER_CLASSES.pop("CustomTokenizerFast", None) REGISTERED_FAST_ALIASES.pop("CustomTokenizer", None) def test_from_pretrained_dynamic_tokenizer(self): @@ -554,7 +571,6 @@ class NewTokenizer(BertTokenizer): del CONFIG_MAPPING._extra_content["custom"] if CustomConfig in TOKENIZER_MAPPING._extra_content: del TOKENIZER_MAPPING._extra_content[CustomConfig] - REGISTERED_TOKENIZER_CLASSES.pop("NewTokenizer", None) def test_from_pretrained_dynamic_tokenizer_legacy_format(self): tokenizer = AutoTokenizer.from_pretrained( @@ -765,3 +781,17 @@ def test_mismatched_model_type_uses_config_tokenizer_class_without_sentencepiece revision="f8d333a098d19b4fd9a8b18f94170487ad3f821d", ) self.assertEqual(tokenizer.__class__.__name__, "NllbTokenizer") + + @require_tokenizers + def test_models_with_incorrect_hub_tokenizer_class_use_tokenizers_backend(self): + """Regression test for https://github.com/huggingface/transformers/issues/45488. + + DeepSeek-V3/R1 declare `tokenizer_class: LlamaTokenizerFast` in `tokenizer_config.json` + but ship a ByteLevel `tokenizer.json`. `LlamaTokenizerFast.__init__` overwrites the + pre-tokenizer with `Metaspace`, dropping all spaces from round-trip. The + `MODELS_WITH_INCORRECT_HUB_TOKENIZER_CLASS` override pins these model types to + `TokenizersBackend`; the dispatch in `AutoTokenizer.from_pretrained` must honor it. 
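+
+        The exact round-trip check below ("hello world" must decode back with its space intact) is what
+        distinguishes the correct `TokenizersBackend` dispatch from the space-dropping Llama path.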
+ """ + tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1") + self.assertEqual(tokenizer.__class__.__name__, "TokenizersBackend") + self.assertEqual(tokenizer.decode(tokenizer.encode("hello world", add_special_tokens=False)), "hello world") diff --git a/tests/models/blt/test_modeling_blt.py b/tests/models/blt/test_modeling_blt.py index a3f50157b38a..fe2ca9555e69 100644 --- a/tests/models/blt/test_modeling_blt.py +++ b/tests/models/blt/test_modeling_blt.py @@ -20,6 +20,7 @@ from transformers import AutoTokenizer, is_torch_available from transformers.testing_utils import ( + Expectations, cleanup, require_torch, require_torch_accelerator, @@ -343,7 +344,14 @@ def test_model_logits(self): def test_model_bf16(self): """Test Blt model with bfloat16 precision.""" NUM_TOKENS_TO_GENERATE = 200 - EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m" + # fmt: off + EXPECTED_TEXT = Expectations( + { + (None, None): "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m", + ("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan math club and the michigan computer s", + } + ) + # fmt: on prompt = "my name is" @@ -360,7 +368,7 @@ def test_model_bf16(self): ) output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXT) + self.assertEqual(output_text, EXPECTED_TEXT.get_expectation()) @slow @require_torch_bf16 @@ -473,7 +481,14 @@ def test_model_eager(self): def test_model_bf16_static_cache(self): """Test Blt model with bfloat16 precision and static cache.""" NUM_TOKENS_TO_GENERATE = 200 - EXPECTED_TEXT = "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m" + # fmt: off + EXPECTED_TEXT = Expectations( + { + (None, None): "my name is alex and i am a student at the university of michigan in the college of arts and sciences. i am a senior majoring in computer science and minoring in mathematics. i am also a member of the michigan m", + ("xpu", None): "my name is alex and i am a student at the university of michigan. i am a senior majoring in computer science and minoring in mathematics. 
i am also a member of the michigan math club and the michigan computer s", + } + ) + # fmt: on prompt = "my name is" @@ -492,4 +507,4 @@ def test_model_bf16_static_cache(self): ) output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXT) + self.assertEqual(output_text, EXPECTED_TEXT.get_expectation()) diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 4dbc12f1a0f6..cbc2fff57222 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -570,6 +570,17 @@ def test_model_from_pretrained(self): model = CLIPModel.from_pretrained(model_name) self.assertIsNotNone(model) + @slow + def test_model_from_pretrained_ignores_position_ids_unexpected_keys(self): + _, loading_info = CLIPModel.from_pretrained( + "openai/clip-vit-base-patch32", + output_loading_info=True, + ) + + unexpected_keys = loading_info["unexpected_keys"] + self.assertNotIn("text_model.embeddings.position_ids", unexpected_keys) + self.assertNotIn("vision_model.embeddings.position_ids", unexpected_keys) + @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @slow @is_flaky() diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index e2eeec9bfdfa..eabbe9194fdb 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -16,25 +16,19 @@ import copy import inspect import math -import os -import re -import tempfile import unittest from functools import cached_property from transformers import ConditionalDetrConfig, ResNetConfig, is_torch_available, is_vision_available -from transformers.conversion_mapping import get_model_conversion_mapping -from transformers.core_model_loading import WeightRenaming, process_target_pattern from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, compare_state_dicts, floats_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch - from safetensors.torch import load_file from transformers import ( ConditionalDetrForObjectDetection, @@ -240,88 +234,6 @@ def test_conditional_detr_object_detection_head_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_conditional_detr_object_detection_head_model(*config_and_inputs) - def test_reverse_loading_mapping(self, check_keys_were_modified=True): - # Some conversions from the mapping are specific to `DetrForSegmentation` model only - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - # Some MoE models alternate between a classic MLP and a MoE layer, in which case we want to have at - # lest one MoE layer here to check the mapping - config_to_set = config.get_text_config(decoder=True) - config_to_set.first_k_dense_replace = 1 # means that the first layer (idx 0) will be MLP, then MoE - config_to_set.moe_layer_start_index = 1 # same as above but for Ernie 4.5... 
- config_to_set.mlp_only_layers = [0] # same but for qwens - config_to_set.num_dense_layers = 1 # lfm2_moe - - for model_class in self.all_model_classes: - # Each individual model is a subtest - with self.subTest(model_class.__name__): - model = model_class(copy.deepcopy(config)) - # Skip if no conversions - conversions = get_model_conversion_mapping(model, add_legacy=False) - if len(conversions) == 0: - # No conversion mapping for this model only, needs to test other classes - continue - - # Find the model keys, so the targets according to the conversions - model_keys = list(model.state_dict().keys()) - - with tempfile.TemporaryDirectory() as tmpdirname: - # Serialize with reverse mapping - model.save_pretrained(tmpdirname) - state_dict = load_file(os.path.join(tmpdirname, "model.safetensors")) - # Get all the serialized keys that we just saved according to the reverse mapping - serialized_keys = list(state_dict.keys()) - - if check_keys_were_modified: - # They should be different, otherwise we did not perform any mapping - self.assertNotEqual(sorted(serialized_keys), sorted(model_keys), "No key mapping was performed!") - - # Check that for each conversion entry, we at least map to one key - for conversion in conversions: - for source_pattern in conversion.source_patterns: - # Sometimes the mappings specify keys that are tied, so absent from the saved state dict - if isinstance(conversion, WeightRenaming): - # We need to revert the target pattern to make it compatible with regex search - target_pattern_reversed = conversion.target_patterns[0] - captured_group = process_target_pattern(source_pattern)[1] - if captured_group: - target_pattern_reversed = target_pattern_reversed.replace(r"\1", captured_group) - if any(re.search(target_pattern_reversed, k) for k in model.all_tied_weights_keys.keys()): - continue - num_matches = sum(re.search(source_pattern, key) is not None for key in serialized_keys) - - # DIFF FROM MIXIN IS HERE - if ( - "bbox" in source_pattern or "mask_head" in source_pattern - ) and model_class != ConditionalDetrForSegmentation: - pass - else: - self.assertTrue( - num_matches > 0, - f"`{source_pattern}` in `{conversion}` did not match any of the source keys. 
" - "This indicates whether that the pattern is not properly written, or that it could not be reversed correctly", - ) - - # If everything is still good at this point, let's test that we perform the same operations both when - # reverting ops from `from_pretrained` and from `__init__` - with tempfile.TemporaryDirectory() as tmpdirname: - # The model was instantiated from __init__ before being saved - model.save_pretrained(tmpdirname) - state_dict_saved_from_init = load_file(os.path.join(tmpdirname, "model.safetensors")) - - # Now reload it - model_reloaded = model_class.from_pretrained(tmpdirname) - - # Make sure both loaded state_dict are identical - self.assertTrue(compare_state_dicts(model_reloaded.state_dict(), model.state_dict())) - - # The model was instantiated from `from_pretrained` before being saved - model_reloaded.save_pretrained(tmpdirname) - state_dict_saved_from_pretrained = load_file(os.path.join(tmpdirname, "model.safetensors")) - - # Make sure both saved state_dict are identical - self.assertTrue(compare_state_dicts(state_dict_saved_from_init, state_dict_saved_from_pretrained)) - # TODO: check if this works again for PyTorch 2.x.y @unittest.skip(reason="Got `CUDA error: misaligned address` with PyTorch 2.0.0.") def test_multi_gpu_data_parallel_forward(self): diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index c4baec276f4f..f1a2fdbea70b 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -16,8 +16,6 @@ import copy import inspect import math -import os -import re import tempfile import unittest from functools import cached_property @@ -25,8 +23,6 @@ from parameterized import parameterized from transformers import DetrConfig, ResNetConfig, is_torch_available, is_vision_available -from transformers.conversion_mapping import get_model_conversion_mapping -from transformers.core_model_loading import WeightRenaming, process_target_pattern from transformers.testing_utils import Expectations, require_timm, require_torch, require_vision, slow, torch_device from ...test_configuration_common import ConfigTester @@ -34,7 +30,6 @@ TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, ModelTesterMixin, _test_eager_matches_sdpa_inference, - compare_state_dicts, floats_tensor, ) from ...test_pipeline_mixin import PipelineTesterMixin @@ -42,7 +37,6 @@ if is_torch_available(): import torch - from safetensors.torch import load_file from transformers import DetrForObjectDetection, DetrForSegmentation, DetrModel @@ -206,88 +200,6 @@ class DetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_missing_keys = False zero_init_hidden_state = True - def test_reverse_loading_mapping(self, check_keys_were_modified=True): - # Some conversions from the mapping are specific to `DetrForSegmentation` model only - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - # Some MoE models alternate between a classic MLP and a MoE layer, in which case we want to have at - # lest one MoE layer here to check the mapping - config_to_set = config.get_text_config(decoder=True) - config_to_set.first_k_dense_replace = 1 # means that the first layer (idx 0) will be MLP, then MoE - config_to_set.moe_layer_start_index = 1 # same as above but for Ernie 4.5... 
- config_to_set.mlp_only_layers = [0] # same but for qwens - config_to_set.num_dense_layers = 1 # lfm2_moe - - for model_class in self.all_model_classes: - # Each individual model is a subtest - with self.subTest(model_class.__name__): - model = model_class(copy.deepcopy(config)) - # Skip if no conversions - conversions = get_model_conversion_mapping(model, add_legacy=False) - if len(conversions) == 0: - # No conversion mapping for this model only, needs to test other classes - continue - - # Find the model keys, so the targets according to the conversions - model_keys = list(model.state_dict().keys()) - - with tempfile.TemporaryDirectory() as tmpdirname: - # Serialize with reverse mapping - model.save_pretrained(tmpdirname) - state_dict = load_file(os.path.join(tmpdirname, "model.safetensors")) - # Get all the serialized keys that we just saved according to the reverse mapping - serialized_keys = list(state_dict.keys()) - - if check_keys_were_modified: - # They should be different, otherwise we did not perform any mapping - self.assertNotEqual(sorted(serialized_keys), sorted(model_keys), "No key mapping was performed!") - - # Check that for each conversion entry, we at least map to one key - for conversion in conversions: - for source_pattern in conversion.source_patterns: - # Sometimes the mappings specify keys that are tied, so absent from the saved state dict - if isinstance(conversion, WeightRenaming): - # We need to revert the target pattern to make it compatible with regex search - target_pattern_reversed = conversion.target_patterns[0] - captured_group = process_target_pattern(source_pattern)[1] - if captured_group: - target_pattern_reversed = target_pattern_reversed.replace(r"\1", captured_group) - if any(re.search(target_pattern_reversed, k) for k in model.all_tied_weights_keys.keys()): - continue - num_matches = sum(re.search(source_pattern, key) is not None for key in serialized_keys) - - # DIFF FROM MIXIN IS HERE - if ( - "bbox" in source_pattern or "mask_head" in source_pattern - ) and model_class != DetrForSegmentation: - pass - else: - self.assertTrue( - num_matches > 0, - f"`{source_pattern}` in `{conversion}` did not match any of the source keys. 
" - "This indicates whether that the pattern is not properly written, or that it could not be reversed correctly", - ) - - # If everything is still good at this point, let's test that we perform the same operations both when - # reverting ops from `from_pretrained` and from `__init__` - with tempfile.TemporaryDirectory() as tmpdirname: - # The model was instantiated from __init__ before being saved - model.save_pretrained(tmpdirname) - state_dict_saved_from_init = load_file(os.path.join(tmpdirname, "model.safetensors")) - - # Now reload it - model_reloaded = model_class.from_pretrained(tmpdirname) - - # Make sure both loaded state_dict are identical - self.assertTrue(compare_state_dicts(model_reloaded.state_dict(), model.state_dict())) - - # The model was instantiated from `from_pretrained` before being saved - model_reloaded.save_pretrained(tmpdirname) - state_dict_saved_from_pretrained = load_file(os.path.join(tmpdirname, "model.safetensors")) - - # Make sure both saved state_dict are identical - self.assertTrue(compare_state_dicts(state_dict_saved_from_init, state_dict_saved_from_pretrained)) - # special case for head models def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) @@ -602,6 +514,31 @@ def test_greyscale_images(self): self.assertTrue(outputs) + def test_nested_base_model_prefix_checkpoint_loading(self): + """Segmentation checkpoints load into Seg / OD / backbone without missing keys; backbone-only checkpoints load + without unexpected keys (nested `base_model_prefix` key resolution).""" + config = self.model_tester.get_config() + + with tempfile.TemporaryDirectory() as seg_ckpt_dir: + DetrForSegmentation(config).save_pretrained(seg_ckpt_dir) + for model_class in (DetrForSegmentation, DetrForObjectDetection, DetrModel): + _, info = model_class.from_pretrained(seg_ckpt_dir, output_loading_info=True) + self.assertEqual( + info["missing_keys"], + set(), + msg=f"Seg checkpoint -> {model_class.__name__}: missing_keys={sorted(info['missing_keys'])}", + ) + + with tempfile.TemporaryDirectory() as base_ckpt_dir: + DetrModel(config).save_pretrained(base_ckpt_dir) + for model_class in (DetrForSegmentation, DetrForObjectDetection, DetrModel): + _, info = model_class.from_pretrained(base_ckpt_dir, output_loading_info=True) + self.assertEqual( + info["unexpected_keys"], + set(), + msg=f"DetrModel checkpoint -> {model_class.__name__}: unexpected_keys={sorted(info['unexpected_keys'])}", + ) + # override test_eager_matches_sdpa_inference to set use_attention_mask to False # as masks used in test are not adapted to the ones used in the model @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) diff --git a/tests/models/fast_vlm/test_modeling_fast_vlm.py b/tests/models/fast_vlm/test_modeling_fast_vlm.py index f66f27b003bc..5e26b591f339 100644 --- a/tests/models/fast_vlm/test_modeling_fast_vlm.py +++ b/tests/models/fast_vlm/test_modeling_fast_vlm.py @@ -27,7 +27,9 @@ is_vision_available, ) from transformers.testing_utils import ( + Expectations, cleanup, + require_deterministic_for_xpu, require_torch, require_vision, slow, @@ -269,6 +271,7 @@ def test_small_model_integration_test(self): ) @require_vision + @require_deterministic_for_xpu def test_small_model_integration_test_batch(self): model = FastVlmForConditionalGeneration.from_pretrained( "KamilaMila/FastVLM-0.5B", device_map=torch_device, dtype=torch.bfloat16 @@ -281,6 +284,7 @@ def 
test_small_model_integration_test_batch(self): image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + self.processor.tokenizer.padding_side = "left" inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True).to( torch_device, dtype=model.dtype, @@ -288,14 +292,22 @@ def test_small_model_integration_test_batch(self): output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = [ - "user\n\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nassistant\n\nWhen visiting this serene place, it's essential to be mindful of the following:\n\n1. **", - "user\n\nWhat is this?\nassistant\nThe image depicts two cats lying on a pink surface, which could be a couch or a" - ] # fmt: skip + EXPECTED_DECODED_TEXT = Expectations( + { + (None, None): [ + "user\n\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nassistant\n\nWhen visiting this serene place, it's essential to be mindful of the following:\n\n1. **", + "user\n\nWhat is this?\nassistant\n\nThe image depicts two cats, one of which is a tabby, lying on a pink surface", + ], + ("xpu", None): [ + "user\n\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nassistant\n\nWhen visiting this serene place, it's essential to be mindful of the following:\n\n1. **", + "user\n\nWhat is this?\nassistant\n\nThe image depicts two cats, one of which is a kitten, resting on a pink surface.", + ], + } + ) self.assertEqual( self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, + EXPECTED_DECODED_TEXT.get_expectation(), ) def test_generation_no_images(self): diff --git a/tests/models/gemma4/test_modeling_gemma4.py b/tests/models/gemma4/test_modeling_gemma4.py index 9d3924d13935..f6b204db5adb 100644 --- a/tests/models/gemma4/test_modeling_gemma4.py +++ b/tests/models/gemma4/test_modeling_gemma4.py @@ -17,6 +17,7 @@ import pytest from parameterized import parameterized +from pytest import mark from transformers import ( AutoTokenizer, @@ -27,8 +28,13 @@ from transformers.testing_utils import ( Expectations, cleanup, + require_deterministic_for_xpu, + require_flash_attn, + require_flash_attn_3, + require_flash_attn_4, require_torch, require_torch_accelerator, + require_torch_gpu, require_torch_multi_gpu, slow, torch_device, @@ -110,6 +116,27 @@ def test_model_rope_scaling_from_config(self): def test_generate_from_random_inputs_embeds(self): pass + def test_use_cache_false_with_kv_sharing(self): + """Regression test: use_cache=False must produce the same logits as use_cache=True. + + Gemma4 uses KV sharing (num_kv_shared_layers) where later layers reuse K/V from earlier + layers via the cache object. When use_cache=False the cache was not created, breaking the + sharing mechanism and causing receiver layers to use keys as values (garbage logits). 
+ See https://github.com/huggingface/transformers/issues/45242 + """ + config = self.model_tester.get_config() + config.attention_k_eq_v = True + config.num_global_key_value_heads = config.num_key_value_heads + model = Gemma4ForCausalLM(config).to(torch_device).eval() + input_ids = ids_tensor([1, 16], config.vocab_size).to(torch_device) + + with torch.no_grad(): + out_cached = model(input_ids, use_cache=True) + out_uncached = model(input_ids, use_cache=False) + + torch.testing.assert_close(out_cached.logits, out_uncached.logits, atol=1e-4, rtol=1e-4) + self.assertIsNone(out_uncached.past_key_values, "past_key_values should be None when use_cache=False") + @unittest.skip( "Flaky on CI, but not locally on Mac. If model is set to fp32 instead of bf16, not flaky anymore." "TODO Cyril: investigate where the loss of precision between bf16 and fp32 comes from." @@ -126,6 +153,20 @@ def test_tp_generation_quantized(self): def test_model_training(self): pass + @unittest.skip( + "Under non-bf16 dtypes, MoE grouped_mm falls back to " + "_grouped_mm_fallback_backward which is incompatible with torch.compile." + ) + def test_flash_attn_2_can_compile_with_attention_mask_None_without_graph_break(self): + pass + + @unittest.skip( + "Under non-bf16 dtypes, MoE grouped_mm falls back to " + "_grouped_mm_fallback_backward which is incompatible with torch.compile." + ) + def test_torch_compile_for_training(self): + pass + class Gemma4Audio2TextModelTester: def __init__( @@ -470,6 +511,54 @@ def test_num_layers_is_small(self): def test_generate_from_random_inputs_embeds(self): pass + @require_flash_attn + @require_torch_accelerator + @mark.flash_attn_test + @slow + def test_flash_attn_2_from_config(self): + # Gemma4 requires mm_token_type_ids in train mode, so we test in eval mode + self.flash_attn_from_config(attn_implementation="flash_attention_2", test_fwd_in_train=False) + + @require_flash_attn_3 + @require_torch_gpu + @mark.flash_attn_3_test + @slow + def test_flash_attn_3_from_config(self): + # Gemma4 requires mm_token_type_ids in train mode, so we test in eval mode + self.flash_attn_from_config(attn_implementation="flash_attention_3", test_fwd_in_train=False) + + @require_flash_attn_4 + @require_torch_gpu + @mark.flash_attn_4_test + @slow + def test_flash_attn_4_from_config(self): + # Gemma4 requires mm_token_type_ids in train mode, so we test in eval mode + self.flash_attn_from_config(attn_implementation="flash_attention_4", test_fwd_in_train=False) + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_2_inference_equivalence(self): + pass + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_2_inference_equivalence_right_padding(self): + pass + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_3_inference_equivalence(self): + pass + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_3_inference_equivalence_right_padding(self): + pass + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_4_inference_equivalence(self): + pass + + @unittest.skip("The base test does not pass image_position_ids and mm_token_type_ids required by Gemma4") + def test_flash_attn_4_inference_equivalence_right_padding(self): + pass + 
@unittest.skip( "Randomly starts failing after module order changed in the __init__ because accelertate is not robust enough" ) @@ -516,6 +605,7 @@ def setUp(self): def tearDown(self): cleanup(torch_device, gc_collect=True) + @require_deterministic_for_xpu def test_model_with_image(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) @@ -534,11 +624,13 @@ def test_model_with_image(self): EXPECTED_TEXTS = Expectations( { ("cuda", 8): ['This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background'], + ("xpu", 3): ['This image shows a **brown and white cow standing on a sandy beach near the ocean**.\n\nHere are some details about the image:\n\n* '], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) + @require_deterministic_for_xpu def test_model_with_image_batch(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) @@ -580,11 +672,16 @@ def test_model_with_image_batch(self): "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background", "No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue", ], + ("xpu", 3): [ + "This image shows a **brown and white cow** standing on a **sandy beach** with the **ocean and a blue sky** in the background", + "No, these images are not identical.\n\nThe first image is a photograph of a **brown and white cow standing on a beach** under a blue", + ], } ) EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) + @require_deterministic_for_xpu def test_model_multiimage(self): model = Gemma4ForConditionalGeneration.from_pretrained(self.model_name, device_map=torch_device) @@ -614,6 +711,7 @@ def test_model_multiimage(self): EXPECTED_TEXTS = Expectations( { ("cuda", 8): ['Based on the image, here is a description of what I see:\n\n**Foreground & Street Scene:**\n* **Traffic Sign:** The most prominent'], + ("xpu", 3): ['Based on the image, here is a description of what I see:\n\n**Foreground & Street Scene:**\n* **Roadway:** There is an'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -647,6 +745,7 @@ def test_model_text_only_multigpu(self): EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() self.assertEqual(output_text, EXPECTED_TEXT) + @require_deterministic_for_xpu def test_model_text_only(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map=torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_name, padding_side="left") @@ -666,6 +765,7 @@ def test_model_text_only(self): { ("cuda", (8, 0)): ['## The Algorithmic Mind\n\nA whisper starts, a seed unseen,\nOf data vast, a vibrant sheen.\nA sea of numbers,'], ("cuda", (8, 6)): ['## The Algorithmic Mind\n\nA tapestry of data, vast and deep,\nWhere silent numbers in their slumber sleep.\nA sea of text'], + ("xpu", 3): ['## The Algorithmic Mind\n\nA whisper starts in silicon deep,\nWhere data streams in endless sweep.\nNo flesh and blood, no beating'], } ) # fmt: skip EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() @@ -696,6 +796,7 @@ def test_states_sharing_with_and_without_cache(self): # Note: we do not test FA2 as the head dim is 512 on some layers, which is not compatible with the kernels @parameterized.expand([("sdpa",), ("eager",)]) + 
@require_deterministic_for_xpu def test_generation_beyond_sliding_window(self, attn_implementation: str): """Test that we can correctly generate beyond the sliding window. Outputs for every attention functions should be coherent and identical. @@ -734,7 +835,11 @@ def test_generation_beyond_sliding_window(self, attn_implementation: str): ("cuda", 8): [ "That sounds lovely! It seems like you're really enjoying the place you'", "Here are a few ways you could use or expand upon that list, depending on", - ] + ], + ("xpu", 3): [ + "That sounds lovely! It seems like you're really enjoying the place you'", + "Here are a few ways you could use or expand upon that list, depending on", + ], } ) self.assertEqual(output_text, EXPECTED_COMPLETIONS.get_expectation()) diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index 8e409064320c..859aa8232851 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -84,6 +84,18 @@ def test_tokenization_tiktoken(self): tiktoken_fast_tokenizer.decode(rust_tokenizer.encode(sequence)), ) + def test_added_tokens_unicode_roundtrip_with_bytelevel(self): + """Regression (#45051): added vocabulary with Unicode must encode/decode cleanly for ByteLevel without a normalizer.""" + tokenizer = AutoTokenizer.from_pretrained(self.from_pretrained_id[0]) + new_tokens = ["Začnimo", "kuća", "međa"] + tokenizer.add_tokens(new_tokens) + + for word in new_tokens: + with self.subTest(word=word): + ids = tokenizer.encode(word, add_special_tokens=False) + decoded = tokenizer.decode(ids, skip_special_tokens=False) + self.assertEqual(decoded, word) + @require_tokenizers class OPTTokenizationTest(unittest.TestCase): diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 1b56c8c6e5a8..6db2f45a341e 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -89,6 +89,14 @@ def test_load_balancing_loss(self): self.assertEqual(result.router_logits[0].shape, (91, config.num_local_experts)) torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(2, dtype=torch.float32), rtol=1e-2, atol=1e-2) + # Verify router_logits are raw logits, not softmax probabilities (regression test for double-softmax bug) + for layer_logits in result.router_logits: + row_sums = layer_logits.sum(dim=-1) + self.assertFalse( + torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-3), + "router_logits should be raw logits (row sums != 1.0), not softmax probabilities", + ) + # First, we make sure that adding padding tokens doesn't change the loss # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding) pad_length = input_ids.shape[1] * 4 diff --git a/tests/models/nemotron_h/test_modeling_nemotron_h.py b/tests/models/nemotron_h/test_modeling_nemotron_h.py index 6aed0bb1ac62..290961265d4f 100644 --- a/tests/models/nemotron_h/test_modeling_nemotron_h.py +++ b/tests/models/nemotron_h/test_modeling_nemotron_h.py @@ -386,8 +386,8 @@ def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_l # Check each layer has the correct shape for layer, layer_type in zip(past_key_values.layers, config.layer_types): - # Moe layers have a default mamba cache instantiated, but it stays empty as the layer does not use it - if layer_type == "moe": + # MoE/MLP layers have a default mamba cache instantiated, but it stays empty as the layer does not 
use it + if layer_type in ("moe", "mlp"): self.assertEqual(layer.conv_states, None) self.assertEqual(layer.recurrent_states, None) # Attention layer cache @@ -399,7 +399,7 @@ def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_l self.assertEqual(layer.conv_states.shape, conv_shape) self.assertEqual(layer.recurrent_states.shape, recurrent_shape) else: - raise ValueError("Unknown layer type.") + raise ValueError(f"Unknown layer type: {layer_type}") def setUp(self): self.model_tester = NemotronHModelTester(self) @@ -805,6 +805,128 @@ def test_pattern_conversion_methods(self): roundtrip_pattern = NemotronHConfig._list_to_pattern(NemotronHConfig._pattern_to_list(original_pattern)) self.assertEqual(original_pattern, roundtrip_pattern) + # Test MLP layer type (dash pattern) + pattern_with_mlp = "M-M*" + layers = NemotronHConfig._pattern_to_list(pattern_with_mlp) + self.assertEqual(layers, ["mamba", "mlp", "mamba", "attention"]) + + # Test roundtrip with MLP + roundtrip = NemotronHConfig._list_to_pattern(NemotronHConfig._pattern_to_list("M-M-*E")) + self.assertEqual(roundtrip, "M-M-*E") + + def test_mlp_layer_type_config(self): + """Test that 'mlp' is accepted as a valid layer type in config (regression test for Nemotron-H models + that use '-' / 'mlp' standalone layers in their hybrid_override_pattern).""" + # Config with mlp layers via layers_block_type list + config = NemotronHConfig( + vocab_size=100, hidden_size=32, layers_block_type=["mamba", "mlp", "mamba", "attention", "mlp"] + ) + self.assertEqual(config.num_hidden_layers, 5) + self.assertEqual(config.layers_block_type[1], "mlp") + self.assertEqual(config.layers_block_type[4], "mlp") + + # Config with mlp layers via legacy hybrid_override_pattern (the '-' character) + config2 = NemotronHConfig(vocab_size=100, hidden_size=32, hybrid_override_pattern="M-M*-") + self.assertEqual(config2.layers_block_type, ["mamba", "mlp", "mamba", "attention", "mlp"]) + self.assertEqual(config2.hybrid_override_pattern, "M-M*-") + + @require_torch + def test_mlp_layer_type_forward(self): + """Test that a tiny NemotronH model with MLP layers can run a forward pass (regression test).""" + config = NemotronHConfig( + vocab_size=99, + hidden_size=32, + layers_block_type=["mamba", "mlp", "mamba", "attention", "mlp"], + num_attention_heads=4, + num_key_value_heads=2, + head_dim=32, + intermediate_size=40, + use_mamba_kernels=False, + ssm_state_size=16, + mamba_num_heads=8, + mamba_n_groups=8, + mamba_head_dim=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=64, + ) + + model = NemotronHModel(config=config) + model.to(torch_device) + model.eval() + + input_ids = ids_tensor([2, 7], config.vocab_size).to(torch_device) + with torch.no_grad(): + result = model(input_ids) + self.assertEqual(result.last_hidden_state.shape, (2, 7, 32)) + + @require_torch + def test_mlp_layer_type_causal_lm(self): + """Test that NemotronHForCausalLM with MLP layers can generate tokens (regression test).""" + config = NemotronHConfig( + vocab_size=99, + hidden_size=32, + layers_block_type=["mamba", "mlp", "mamba", "attention", "mlp"], + num_attention_heads=4, + num_key_value_heads=2, + head_dim=32, + intermediate_size=40, + use_mamba_kernels=False, + ssm_state_size=16, + mamba_num_heads=8, + mamba_n_groups=8, + mamba_head_dim=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=64, + ) + + model = NemotronHForCausalLM(config=config) + model.to(torch_device) + model.eval() + + input_ids = ids_tensor([1, 5], config.vocab_size).to(torch_device) + with 
torch.no_grad(): + output = model.generate(input_ids, max_new_tokens=3, do_sample=False, use_cache=True) + # Should have generated 3 new tokens + self.assertEqual(output.shape[1], 5 + 3) + + @require_torch + def test_mlp_layer_type_nemotron_h_pattern(self): + """Test with a pattern resembling real Nemotron-H models (e.g. Nano-4B: M-M-M-MM-M-M*-...).""" + # Use a shortened version of the real Nano-4B pattern + config = NemotronHConfig( + vocab_size=99, + hidden_size=32, + hybrid_override_pattern="M-M-*M-M", + num_attention_heads=4, + num_key_value_heads=2, + head_dim=32, + intermediate_size=40, + use_mamba_kernels=False, + ssm_state_size=16, + mamba_num_heads=8, + mamba_n_groups=8, + mamba_head_dim=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=64, + ) + + self.assertEqual( + config.layers_block_type, + ["mamba", "mlp", "mamba", "mlp", "attention", "mamba", "mlp", "mamba"], + ) + + model = NemotronHForCausalLM(config=config) + model.to(torch_device) + model.eval() + + input_ids = ids_tensor([1, 5], config.vocab_size).to(torch_device) + with torch.no_grad(): + result = model(input_ids) + self.assertEqual(result.logits.shape, (1, 5, 99)) + @require_torch class NemotronHModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/nomic_bert/test_modeling_nomic_bert.py b/tests/models/nomic_bert/test_modeling_nomic_bert.py index 389c86b911ee..822f075ded6d 100644 --- a/tests/models/nomic_bert/test_modeling_nomic_bert.py +++ b/tests/models/nomic_bert/test_modeling_nomic_bert.py @@ -314,6 +314,20 @@ def test_inference_no_head_absolute_embedding_v1_5(self): ], ] ), + ("xpu", None): torch.tensor( + [ + [ + [1.7039e00, -4.5610e00, 1.5236e00], + [1.8685e00, -3.6936e00, 1.6641e00], + [5.3303e-01, -4.2081e00, 2.3375e00], + ], + [ + [2.6867e-03, -3.7496e00, 9.0820e-01], + [1.8297e-02, -3.3884e00, 3.5300e-01], + [-1.4282e-01, -3.6776e00, -3.5079e-01], + ], + ] + ), } ).get_expectation() # fmt: on @@ -353,6 +367,20 @@ def test_inference_no_head_absolute_embedding_v1(self): ] ] ), + ("xpu", None): torch.tensor( + [ + [ + [ 1.2961, -1.1757, 1.2094], + [ 1.1350, 0.5400, 1.4580], + [-0.2897, -0.5351, 2.0092], + ], + [ + [-0.2866, -0.9786, 0.8613], + [-0.3104, -0.3421, 0.4867], + [-0.4336, -0.8528, -0.2509], + ] + ] + ), } ).get_expectation() # fmt: on diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 6274f26ea605..e93ae070fa90 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -276,13 +276,13 @@ def test_flex_attention_with_grads(self): @slow class Phi4MultimodalIntegrationTest(unittest.TestCase): checkpoint_path = "microsoft/Phi-4-multimodal-instruct" - revision = "refs/pr/70" + revision = "refs/pr/94" image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg" audio_url = "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav" def setUp(self): # Currently, the Phi-4 checkpoint on the hub is not working with the latest Phi-4 code, so the slow integration tests - # won't pass without using the correct revision (refs/pr/70) + # won't pass without using the correct revision (refs/pr/94) self.processor = AutoProcessor.from_pretrained(self.checkpoint_path, revision=self.revision) self.generation_config = GenerationConfig(max_new_tokens=20, do_sample=False) self.user_token = "<|user|>" diff --git 
a/tests/models/phi4_multimodal/test_processing_phi4_multimodal.py b/tests/models/phi4_multimodal/test_processing_phi4_multimodal.py index 343768c0bb5f..a8c3f0db4db2 100644 --- a/tests/models/phi4_multimodal/test_processing_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_processing_phi4_multimodal.py @@ -32,7 +32,7 @@ class Phi4MultimodalProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Phi4MultimodalProcessor checkpoint_path = "microsoft/Phi-4-multimodal-instruct" - revision = "refs/pr/70" + revision = "refs/pr/94" text_input_name = "input_ids" images_input_name = "image_pixel_values" audio_input_name = "audio_input_features" diff --git a/tests/models/qianfan_ocr/test_modeling_qianfan_ocr.py b/tests/models/qianfan_ocr/test_modeling_qianfan_ocr.py index b108f3b0922b..1a101ddc5904 100644 --- a/tests/models/qianfan_ocr/test_modeling_qianfan_ocr.py +++ b/tests/models/qianfan_ocr/test_modeling_qianfan_ocr.py @@ -191,6 +191,7 @@ def test_model_integration_forward(self): { ("cuda", (8, 6)): torch.tensor([10.1250, 15.8125, 13.0625, 12.3125, 9.4375]), ("cuda", (8, 9)): torch.tensor([10.0625, 15.6875, 13.0000, 12.1875, 9.3750]), + ("xpu", None): torch.tensor([10.1875, 15.8750, 13.1875, 12.3750, 9.6250]), } ) # fmt: skip self.assertTrue( @@ -225,6 +226,7 @@ def test_model_integration_generate(self): { ("cuda", (8, 6)): "The image features two striped cats lying down and sleeping on a pink couch. They", ("cuda", (8, 9)): "The image features two striped cats lying down on a pink couch, seemingly asleep.", + ("xpu", None): "The image features two striped cats lying down on a couch, both appearing to be", } ) # fmt: skip self.assertEqual(decoded, expected_outputs.get_expectation()) @@ -247,6 +249,7 @@ def test_model_integration_generate_text_only(self): expected_outputs = Expectations( { ("cuda", None): "1 + 1 equals 2.", + ("xpu", None): "1 + 1 equals 2.", } ) # fmt: skip self.assertEqual(decoded, expected_outputs.get_expectation()) @@ -295,12 +298,14 @@ def test_model_integration_batched_generate(self): expected_outputs_0 = Expectations( { ("cuda", None): "In the tranquil setting of this image, two tabby cats are the stars of", + ("xpu", None): "In the tranquil setting of this image, two tabby cats are the stars of", } ) # fmt: skip expected_outputs_1 = Expectations( { ("cuda", (8, 6)): "The image features two striped cats lying down and sleeping on a pink couch. The", ("cuda", (8, 9)): "The image features two striped cats lying down on a pink couch, seemingly asleep.", + ("xpu", None): "The image features two striped cats lying down on a couch, both appearing to be", } ) # fmt: skip self.assertEqual(decoded_0, expected_outputs_0.get_expectation()) diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index 5a425b434e7d..2644e4d7444e 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -764,6 +764,9 @@ def test_small_model_integration_test_with_video(self): (None, None): [ 'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows two individuals playing tennis on an indoor court. The player in the foreground, dressed in a white shirt and black shorts, is preparing to', ], + ("rocm", (9, 4)): [ + 'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on the service line, preparing to serve. 
The individual appears to be practicing or warming up,', + ], ("xpu", None): [ 'system\nYou are a helpful assistant.\nuser\nWhat is shown in this video?\nassistant\nThe video shows an indoor tennis court with a person standing on the service line, preparing to serve. The individual appears to be practicing or warming up,', ], diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 8776ccdb27dc..8c52fd834278 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -92,6 +92,14 @@ def test_load_balancing_loss(self): self.assertEqual(result.router_logits[0].shape, (91, config.num_experts)) torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(2, dtype=torch.float32), rtol=1e-2, atol=1e-2) + # Verify router_logits are raw logits, not softmax probabilities (regression test for double-softmax bug) + for layer_logits in result.router_logits: + row_sums = layer_logits.sum(dim=-1) + self.assertFalse( + torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-3), + "router_logits should be raw logits (row sums != 1.0), not softmax probabilities", + ) + # First, we make sure that adding padding tokens doesn't change the loss # loss(input_ids, attention_mask=None) == loss(input_ids + padding, attention_mask=attention_mask_with_padding) pad_length = input_ids.shape[1] * 4 diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py index 178e8f50529a..d6345ade6f4b 100644 --- a/tests/models/segformer/test_image_processing_segformer.py +++ b/tests/models/segformer/test_image_processing_segformer.py @@ -15,6 +15,7 @@ import unittest +import numpy as np from datasets import load_dataset from transformers.testing_utils import require_torch, require_vision @@ -252,6 +253,26 @@ def test_reduce_labels(self): encoding = image_processing(image, map, return_tensors="pt") self.assertTrue(len(encoding["labels"]) == len(map)) + def test_reduce_labels_keeps_void_label(self): + image = np.zeros((2, 2, 3), dtype=np.uint8) + segmentation_map = np.array([[0, 1], [2, 255]], dtype=np.uint8) + expected_labels = torch.tensor([[[255, 0], [1, 255]]], dtype=torch.long) + image_processor_kwargs = self.image_processor_dict.copy() + image_processor_kwargs.update( + { + "do_resize": False, + "do_rescale": False, + "do_normalize": False, + "do_reduce_labels": True, + } + ) + + for image_processing_class in self.image_processing_classes.values(): + image_processing = image_processing_class(**image_processor_kwargs) + + encoding = image_processing(image, segmentation_map, return_tensors="pt") + self.assertTrue(torch.equal(encoding["labels"], expected_labels)) + def test_backends_equivalence(self): if len(self.image_processing_classes) < 2: self.skipTest(reason="Skipping backends equivalence test as there are less than 2 backends") diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 539ab98a479b..997736901f3a 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -172,6 +172,30 @@ def test_eager_matches_sdpa_inference( ): pass + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def test_flash_attn_2_inference_equivalence(self): + pass + + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def 
test_flash_attn_2_inference_equivalence_right_padding(self): + pass + + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def test_flash_attn_3_inference_equivalence(self): + pass + + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def test_flash_attn_3_inference_equivalence_right_padding(self): + pass + + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def test_flash_attn_4_inference_equivalence(self): + pass + + @unittest.skip(reason="X-CLIP needs batch size to match frames, can't crop and create new dummy inputs") + def test_flash_attn_4_inference_equivalence_right_padding(self): + pass + def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -561,6 +585,10 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass + @unittest.skip(reason="Does not work on the tiny model as we keep hitting edge cases.") + def test_model_parallelism(self): + pass + def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/quantization/hqq/test_hqq.py b/tests/quantization/hqq/test_hqq.py index 913bf6bf9e75..ad2797229fa5 100755 --- a/tests/quantization/hqq/test_hqq.py +++ b/tests/quantization/hqq/test_hqq.py @@ -14,7 +14,6 @@ import gc import unittest -from unittest import skip import accelerate @@ -106,7 +105,6 @@ def test_to_dict(self): @require_torch_accelerator @require_accelerate @require_hqq -@skip("skip for now until we add back support") class HQQTest(unittest.TestCase): def tearDown(self): cleanup() @@ -164,7 +162,6 @@ def test_quantized_model_fake_weight_dtype(self): @require_torch_multi_accelerator @require_accelerate @require_hqq -@skip("skip for now until we add back support") class HQQTestMultiGPU(unittest.TestCase): def tearDown(self): cleanup() @@ -188,7 +185,6 @@ def test_fp16_quantized_model_multipgpu(self): @require_torch_accelerator @require_accelerate @require_hqq -@skip("skip for now until we add back support") class HQQTestBias(unittest.TestCase): def tearDown(self): cleanup() @@ -245,7 +241,6 @@ def test_save_and_load_quantized_model(self): @require_torch_accelerator @require_accelerate @require_hqq -@skip("skip for now until we add back support") class HQQSerializationTest(unittest.TestCase): def tearDown(self): cleanup() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index bc8f65891445..71832a048f93 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3361,6 +3361,8 @@ def _get_output_logits(outputs): return outputs.decoder_hidden_states[-1] elif "logits_per_image" in outputs: return outputs.logits_per_image + elif "logits_per_video" in outputs: + return outputs.logits_per_video else: return outputs.logits @@ -3994,8 +3996,9 @@ def flash_attn_from_config(self, attn_implementation: str, test_fwd_in_train: bo self.skipTest(reason=f"At least some parts of this model do not support {attn_implementation}") # TODO: to change it in the future with other relevant auto classes + # deepcopy to avoid mutating the shared config (e.g. 
_from_config sets dtype on sub-configs) fa_model = model_class._from_config( - config, attn_implementation=attn_implementation, dtype=torch.bfloat16 + copy.deepcopy(config), attn_implementation=attn_implementation, dtype=torch.bfloat16 ).to(torch_device) # By default, we perform the forward pass in train mode, because it's more sctrict than eval mode. If the @@ -4762,6 +4765,11 @@ def test_reverse_loading_mapping(self, check_keys_were_modified=True, skip_base_ config_to_set.mlp_only_layers = [0] # same but for qwens config_to_set.num_dense_layers = 1 # lfm2_moe + # Precompute state dict keys for every model class to detect dead conversion + # rules: a rule skipped for the current class must still apply to at least one. + all_classes_model_keys = { + cls: list(cls(copy.deepcopy(config)).state_dict().keys()) for cls in self.all_model_classes + } for model_class in self.all_model_classes: if skip_base_model and "For" not in model_class.__name__: continue @@ -4816,6 +4824,19 @@ def test_reverse_loading_mapping(self, check_keys_were_modified=True, skip_base_ target_pattern_reversed = target_pattern_reversed.replace(r"\1", captured_group) if any(re.search(target_pattern_reversed, k) for k in model.all_tied_weights_keys.keys()): continue + + # Skip rules whose target doesn't appear in this model class (e.g. class-specific head rules), + # but assert the rule still matches at least one class + if not any(re.search(target_pattern_reversed, k) for k in model_keys): + self.assertTrue( + any( + any(re.search(target_pattern_reversed, k) for k in keys) + for keys in all_classes_model_keys.values() + ), + f"`{target_pattern_reversed}` in `{conversion}` does not match any " + "model class — the rule may be dead code or incorrectly written.", + ) + continue num_matches = sum(re.search(source_pattern, key) is not None for key in serialized_keys) self.assertTrue( num_matches > 0, diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 1bf52f0369dd..cd9b1d737b53 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -2018,6 +2018,21 @@ def test_apply_chat_template_tool_calls_no_content(self): result = processor.apply_chat_template(messages, tokenize=True) self.assertIsInstance(result, list) + # Also test with explicit content=None (OpenAI returns this for tool-call-only messages) + messages_with_none = [ + { + "role": "user", + "content": [{"type": "text", "text": "What is the weather?"}], + }, + { + "role": "assistant", + "content": None, + "tool_calls": [{"type": "function", "function": {"name": "get_weather", "arguments": {}}}], + }, + ] + result_none = processor.apply_chat_template(messages_with_none, tokenize=True) + self.assertIsInstance(result_none, list) + def test_get_num_multimodal_tokens_matches_processor_call(self): "Tests that the helper used internally in vLLM works correctly" diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 833134c2913f..56f32fc44a3b 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1086,6 +1086,33 @@ def test_chat_template_batched(self): dummy_conversations, chat_template=dummy_template, tokenize=True ) # Check that no error raised + @require_jinja + def test_chat_template_content_none(self): + """Regression test: content=None (e.g. 
OpenAI tool-call messages) should be treated the same as missing content.""" + dummy_template = ( + "{% for message in messages %}" + "{{ message['role'] }}" + "{% if message.content is defined %}: {{ message['content'] }}{% endif %}" + "\n" + "{% endfor %}" + ) + messages_with_none = [ + {"role": "user", "content": "What is the weather?"}, + {"role": "assistant", "content": None}, + ] + messages_without_content = [ + {"role": "user", "content": "What is the weather?"}, + {"role": "assistant"}, + ] + tokenizer = self.get_tokenizer() + output_none = tokenizer.apply_chat_template( + messages_with_none, chat_template=dummy_template, tokenize=False, return_dict=False + ) + output_missing = tokenizer.apply_chat_template( + messages_without_content, chat_template=dummy_template, tokenize=False, return_dict=False + ) + self.assertEqual(output_none, output_missing) + @require_jinja def test_jinja_loopcontrols(self): break_template = """ diff --git a/tests/utils/test_backbone_utils.py b/tests/utils/test_backbone_utils.py index a27ced73018f..50b9f8e325e1 100644 --- a/tests/utils/test_backbone_utils.py +++ b/tests/utils/test_backbone_utils.py @@ -16,7 +16,7 @@ import pytest -from transformers import DetrConfig, MaskFormerConfig, PreTrainedConfig, ResNetBackbone, ResNetConfig, TimmBackbone +from transformers import MaskFormerConfig, PreTrainedConfig, ResNetBackbone, ResNetConfig, TimmBackbone from transformers.backbone_utils import ( BackboneConfigMixin, BackboneMixin, @@ -162,7 +162,7 @@ def test_load_backbone_from_config(self): config = MaskFormerConfig(backbone_config=ResNetConfig(out_indices=(0, 2))) backbone = load_backbone(config) self.assertEqual(backbone.out_features, ["stem", "stage2"]) - self.assertEqual(backbone.out_indices, (0, 2)) + self.assertEqual(backbone.out_indices, [0, 2]) self.assertIsInstance(backbone, ResNetBackbone) @slow @@ -239,7 +239,7 @@ def get_equal_not_equal_weights(model_0, model_1): not_equal_weights.append(k0) return equal_weights, not_equal_weights - config = MaskFormerConfig(use_pretrained_backbone=False, backbone="microsoft/resnet-18") + config = MaskFormerConfig(backbone="microsoft/resnet-18") model_0 = NewModel(config) model_1 = NewModel(config) equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) @@ -249,7 +249,7 @@ def get_equal_not_equal_weights(model_0, model_1): self.assertEqual(len(equal_weights), 0) self.assertEqual(len(not_equal_weights), 24) - # Now we create a new model with backbone weights that are pretrained + # Setting use_pretrained_backbone has no effect on load_backbone config.use_pretrained_backbone = True model_0 = NewModel(config) model_1 = NewModel(config) @@ -257,29 +257,5 @@ def get_equal_not_equal_weights(model_0, model_1): # Norm layers are always initialized with the same weights equal_weights = [w for w in equal_weights if "normalization" not in w] - self.assertEqual(len(equal_weights), 20) - # Linear layers are still initialized randomly - self.assertEqual(len(not_equal_weights), 4) - - # Check loading in timm backbone - config = DetrConfig(use_pretrained_backbone=False, backbone="resnet18", use_timm_backbone=True) - model_0 = NewModel(config) - model_1 = NewModel(config) - equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) - - # Norm layers are always initialized with the same weights - equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] self.assertEqual(len(equal_weights), 0) self.assertEqual(len(not_equal_weights), 24) - - # Now we create a 
new model with backbone weights that are pretrained - config.use_pretrained_backbone = True - model_0 = NewModel(config) - model_1 = NewModel(config) - equal_weights, not_equal_weights = get_equal_not_equal_weights(model_0, model_1) - - # Norm layers are always initialized with the same weights - equal_weights = [w for w in equal_weights if "bn" not in w and "downsample.1" not in w] - self.assertEqual(len(equal_weights), 20) - # Linear layers are still initialized randomly - self.assertEqual(len(not_equal_weights), 4) diff --git a/tests/utils/test_dynamic_module_utils.py b/tests/utils/test_dynamic_module_utils.py index dfdc63460cd3..ec172748ddc6 100644 --- a/tests/utils/test_dynamic_module_utils.py +++ b/tests/utils/test_dynamic_module_utils.py @@ -13,10 +13,12 @@ # limitations under the License. import os +from pathlib import Path import pytest -from transformers.dynamic_module_utils import get_imports +from transformers import dynamic_module_utils +from transformers.dynamic_module_utils import get_cached_module_file, get_imports TOP_LEVEL_IMPORT = """ @@ -127,3 +129,53 @@ def test_import_parsing(tmp_path, case): parsed_imports = get_imports(tmp_file_path) assert parsed_imports == ["os"] + + +def _create_local_module(module_dir: Path, module_code: str, helper_code: str | None = None): + module_dir.mkdir(parents=True, exist_ok=True) + (module_dir / "custom_model.py").write_text(module_code, encoding="utf-8") + if helper_code is not None: + (module_dir / "helper.py").write_text(helper_code, encoding="utf-8") + + +def test_get_cached_module_file_local_cache_key_uses_content_hash(monkeypatch, tmp_path): + modules_cache = tmp_path / "hf_modules_cache" + monkeypatch.setattr(dynamic_module_utils, "HF_MODULES_CACHE", str(modules_cache)) + + model_dir_a = tmp_path / "pretrained_a" / "subdir" + model_dir_b = tmp_path / "pretrained_b" / "subdir" + model_dir_c = tmp_path / "pretrained_c" / "subdir" + + _create_local_module(model_dir_a, 'MAGIC = "A"\n') + _create_local_module(model_dir_b, 'MAGIC = "B"\n') + _create_local_module(model_dir_c, 'MAGIC = "A"\n') + + cached_module_a = get_cached_module_file(str(model_dir_a), "custom_model.py") + cached_module_b = get_cached_module_file(str(model_dir_b), "custom_model.py") + cached_module_c = get_cached_module_file(str(model_dir_c), "custom_model.py") + + assert Path(cached_module_a).parent.name != "subdir" + assert cached_module_a != cached_module_b + assert cached_module_a == cached_module_c + + +def test_get_cached_module_file_local_cache_key_includes_relative_import_sources(monkeypatch, tmp_path): + modules_cache = tmp_path / "hf_modules_cache" + monkeypatch.setattr(dynamic_module_utils, "HF_MODULES_CACHE", str(modules_cache)) + + model_dir_a = tmp_path / "pretrained_a" / "subdir" + model_dir_b = tmp_path / "pretrained_b" / "subdir" + + module_code = "from .helper import MAGIC\nVALUE = MAGIC\n" + _create_local_module(model_dir_a, module_code, 'MAGIC = "A"\n') + _create_local_module(model_dir_b, module_code, 'MAGIC = "B"\n') + + cached_module_a = get_cached_module_file(str(model_dir_a), "custom_model.py") + cached_module_b = get_cached_module_file(str(model_dir_b), "custom_model.py") + + cached_helper_a = modules_cache / Path(cached_module_a).parent / "helper.py" + cached_helper_b = modules_cache / Path(cached_module_b).parent / "helper.py" + + assert cached_module_a != cached_module_b + assert cached_helper_a.read_text(encoding="utf-8") == 'MAGIC = "A"\n' + assert cached_helper_b.read_text(encoding="utf-8") == 'MAGIC = "B"\n' diff --git 
a/tests/utils/test_import_structure.py b/tests/utils/test_import_structure.py index fb48d35d5248..70b8f28eb2b9 100644 --- a/tests/utils/test_import_structure.py +++ b/tests/utils/test_import_structure.py @@ -192,6 +192,30 @@ def test_import_spread(self): self.assertEqual(ground_truth_spread_import_structure, newly_spread_import_structure) + def test_pil_import_structure_does_not_require_torchvision(self): + import_structure = spread_import_structure(define_import_structure(self.models_path / "gemma3")) + + module_name = "image_processing_pil_gemma3" + object_name = "Gemma3ImageProcessorPil" + matching_backends = [] + + for backends, modules in import_structure.items(): + if module_name in modules and object_name in modules[module_name]: + matching_backends.append(backends) + + self.assertTrue( + matching_backends, + f"Could not find `{object_name}` in the import structure for `{module_name}`.", + ) + self.assertTrue( + any("torchvision" not in backends for backends in matching_backends), + f"`{object_name}` should be importable without torchvision: {matching_backends}", + ) + self.assertFalse( + any("torchvision" in backends for backends in matching_backends), + f"`{object_name}` should not require torchvision: {matching_backends}", + ) + @pytest.mark.parametrize( "backend,package_name,version_comparison,version", diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index fab48f9ddb8a..ce2e2442bcc4 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -1602,6 +1602,28 @@ def test_tied_weights_are_always_tied_from_config(self): model = LlamaForCausalLM._from_config(copy.deepcopy(config)) self.assertTrue(model.lm_head.weight is not model.model.embed_tokens.weight) + def test_save_pretrained_auto_fixes_diverged_tied_embeddings(self): + """Test that save_pretrained sets tie_word_embeddings=False in config when weights have diverged.""" + config = LlamaConfig(num_hidden_layers=2, hidden_size=32, intermediate_size=16, tie_word_embeddings=True) + model = LlamaForCausalLM(config) + + # Simulate PEFT merge_and_unload: untie weights and assign different values + with torch.no_grad(): + model.lm_head.weight = nn.Parameter(model.lm_head.weight.clone()) + model.lm_head.weight.fill_(0.42) + model.model.embed_tokens.weight.fill_(0.24) + + logger = logging.get_logger("transformers.modeling_utils") + with tempfile.TemporaryDirectory() as tmp_dir: + with CaptureLogger(logger) as cl: + model.save_pretrained(tmp_dir) + + self.assertIn("weights have diverged. Saving config with `tie_word_embeddings=False`", cl.out) + + with open(os.path.join(tmp_dir, "config.json")) as f: + saved_config = json.load(f) + self.assertFalse(saved_config["tie_word_embeddings"]) + def test_unexpected_keys_warnings(self): model = ModelWithHead(PreTrainedConfig(tie_word_embeddings=True)) logger = logging.get_logger("transformers.modeling_utils") diff --git a/tests/utils/test_testing_utils.py b/tests/utils/test_testing_utils.py new file mode 100644 index 000000000000..80b06f37159e --- /dev/null +++ b/tests/utils/test_testing_utils.py @@ -0,0 +1,86 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +from transformers import testing_utils + + +class PatchedTestingMethodsOutputFileTest(unittest.TestCase): + def test_get_output_file_without_xdist_worker(self): + with ( + tempfile.TemporaryDirectory() as tmpdir, + mock.patch.dict(os.environ, {"_PATCHED_TESTING_METHODS_OUTPUT_DIR": tmpdir}, clear=True), + ): + output_path = testing_utils._get_patched_testing_methods_output_file() + + self.assertEqual(output_path, Path(tmpdir) / "captured_info.txt") + + def test_get_output_file_with_xdist_worker(self): + with ( + tempfile.TemporaryDirectory() as tmpdir, + mock.patch.dict( + os.environ, + { + "_PATCHED_TESTING_METHODS_OUTPUT_DIR": tmpdir, + "PYTEST_XDIST_WORKER": "gw2", + }, + clear=True, + ), + ): + output_path = testing_utils._get_patched_testing_methods_output_file() + + self.assertEqual(output_path, Path(tmpdir) / "captured_info_gw2.txt") + + def test_prepare_debugging_info_writes_worker_specific_file(self): + with ( + tempfile.TemporaryDirectory() as tmpdir, + mock.patch.dict( + os.environ, + { + "_PATCHED_TESTING_METHODS_OUTPUT_DIR": tmpdir, + "PYTEST_XDIST_WORKER": "gw1", + }, + clear=True, + ), + ): + output_path = Path(tmpdir) / "captured_info_gw1.txt" + rendered_info = testing_utils._prepare_debugging_info("test-info", "payload") + self.assertEqual(rendered_info, "test-info\n\npayload") + self.assertTrue(output_path.exists()) + self.assertIn("test-info\n\npayload", output_path.read_text()) + + def test_reset_only_clears_current_worker_file(self): + with tempfile.TemporaryDirectory() as tmpdir: + current_worker_path = Path(tmpdir) / "captured_info_gw0.txt" + other_worker_path = Path(tmpdir) / "captured_info_gw1.txt" + current_worker_path.write_text("current worker") + other_worker_path.write_text("other worker") + + with mock.patch.dict( + os.environ, + { + "_PATCHED_TESTING_METHODS_OUTPUT_DIR": tmpdir, + "PYTEST_XDIST_WORKER": "gw0", + }, + clear=True, + ): + output_path = testing_utils._reset_patched_testing_methods_output_file() + self.assertEqual(output_path, current_worker_path) + self.assertFalse(current_worker_path.exists()) + self.assertTrue(other_worker_path.exists()) diff --git a/utils/check_config_docstrings.py b/utils/check_config_docstrings.py index c69e977d8d7a..bfebc7732631 100644 --- a/utils/check_config_docstrings.py +++ b/utils/check_config_docstrings.py @@ -86,8 +86,8 @@ def check_config_docstrings_have_checkpoints(): raise ValueError( f"The following configurations don't contain any valid checkpoint:\n{message}\n\n" "The requirement is to include a link pointing to one of the models of this architecture in the " - "docstring of the config classes listed above. The link should be passed to an `auto_docstring`" - "decorator as follows `@auto_docstring(checkpoint='myorg/mymodel')." + "docstring of the config classes listed above. The link should be passed to an `auto_docstring` " + "decorator as follows `@auto_docstring(checkpoint='myorg/mymodel')`." )