diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
index e96c7ef16a07..94f6dece6bc2 100644
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@@ -186,7 +186,18 @@ jobs:
         env:
           report_name_prefix: ${{ inputs.report_name_prefix }}
         run: |
-          cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"
+          shopt -s nullglob
+          captured_info_files=("/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"/captured_info*.txt)
+
+          if [ ${#captured_info_files[@]} -eq 0 ]; then
+            echo "No captured information files found."
+            exit 0
+          fi
+
+          for captured_info_file in "${captured_info_files[@]}"; do
+            echo "===== ${captured_info_file##*/} ====="
+            cat "$captured_info_file"
+          done
 
       - name: Copy test_outputs.txt
         if: ${{ always() }}
diff --git a/all_requirements.txt b/all_requirements.txt
new file mode 100644
index 000000000000..eacb47727a64
--- /dev/null
+++ b/all_requirements.txt
@@ -0,0 +1,107 @@
+gpustat==1.1.1
+psutil==6.0.0
+psycopg2==2.9.9
+pandas>=1.5.0
+numpy>=1.21.0
+psutil>=5.8.0
+nvidia-ml-py>=12.0.0
+torch>=2.0.0
+datasets>=2.10.0
+huggingface_hub>=0.16.0
+amdsmi>=7.0.2
+git+https://github.com/huggingface/transformers.git@main # install main or adjust it with vX.X.X for installing a specific transformers version
+datasets==1.8.0
+accelerate >= 0.12.0
+datasets >= 1.8.0
+torch >= 1.3.0
+evaluate
+accelerate >= 0.21.0
+sentencepiece != 0.1.92
+protobuf
+torch >= 1.3
+datasets[audio]>=1.14.0
+evaluate
+librosa
+torchaudio
+torch>=1.6
+accelerate >= 0.12.0
+datasets >= 1.8.0
+sentencepiece != 0.1.92
+protobuf
+sacrebleu >= 1.4.12
+py7zr
+torch >= 1.3
+evaluate
+datasets >= 2.0.0
+torch >= 1.3
+accelerate
+evaluate
+Pillow
+albumentations >= 1.4.16
+accelerate >= 0.12.0
+datasets >= 1.8.0
+sentencepiece != 0.1.92
+protobuf
+rouge-score
+nltk
+py7zr
+torch >= 1.3
+evaluate
+torch>=1.5.0
+torchvision>=0.6.0
+datasets>=1.8.0
+accelerate >= 0.12.0
+datasets >= 1.8.0
+sentencepiece != 0.1.92
+scipy
+scikit-learn
+protobuf
+torch >= 1.3
+evaluate
+accelerate>=0.12.0
+torch>=1.5.0
+torchvision>=0.6.0
+datasets>=2.14.0
+evaluate
+scikit-learn
+accelerate >= 0.12.0
+torch >= 1.3
+datasets >= 2.14.0
+sentencepiece != 0.1.92
+protobuf
+evaluate
+scikit-learn
+accelerate >= 0.12.0
+seqeval
+datasets >= 1.8.0
+torch >= 1.3
+evaluate
+albumentations >= 1.4.16
+timm
+datasets>=4.0
+torchmetrics
+pycocotools
+datasets[audio] >= 1.18.0
+torch >= 1.5
+torchaudio
+librosa
+jiwer
+evaluate
+datasets[audio] >= 1.12.0
+torch >= 1.5
+torchaudio
+accelerate >= 0.12.0
+librosa
+torch>=1.5.0
+torchvision>=0.6.0
+datasets>=1.8.0
+albumentations >= 1.4.16
+timm
+datasets
+torchmetrics
+pycocotools
+accelerate >= 0.12.0
+sentencepiece != 0.1.92
+protobuf
+torch >= 1.3
+evaluate
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index bd3de7b27311..ee71c087dde2 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -18,9 +18,20 @@ ARG TORCHCODEC='0.11.0'
 ARG FLASH_ATTN='false'
 
+# 'x86_64' or 'arm64'
+ARG ARCHITECTURE='x86_64'
+
 RUN apt update
-RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
+RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs curl
 RUN git lfs install
+
+RUN set -e; \
+if [ "$ARCHITECTURE" = "arm64" ]; then \
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y;\
PATH="/root/.cargo/bin:${PATH}";\ + rustc --version;\ +fi; + RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main @@ -36,7 +47,11 @@ RUN set -e; \ # Determine torch version if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \ VERSION="torch==${PYTORCH}.*"; \ - TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \ + if [ "$ARCHITECTURE" = "arm64" ]; then \ + TORCHCODEC_VERSION="torchcodec"; \ + else \ + TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \ + fi; \ else \ VERSION="torch"; \ TORCHCODEC_VERSION="torchcodec"; \ diff --git a/docs/source/en/auto_docstring.md b/docs/source/en/auto_docstring.md index 5426af13fa31..1b55f0fcc5d1 100644 --- a/docs/source/en/auto_docstring.md +++ b/docs/source/en/auto_docstring.md @@ -134,11 +134,11 @@ class MyModelConfig(PreTrainedConfig): Description of another model-specific parameter. ```python - >>> from transformers import MyModelConfig, MyModel + from transformers import MyModelConfig, MyModel - >>> configuration = MyModelConfig() - >>> model = MyModel(configuration) - >>> configuration = model.config + configuration = MyModelConfig() + model = MyModel(configuration) + configuration = model.config ``` """ diff --git a/docs/source/en/internal/import_utils.md b/docs/source/en/internal/import_utils.md index 41ee64f1611c..abb85008d53e 100644 --- a/docs/source/en/internal/import_utils.md +++ b/docs/source/en/internal/import_utils.md @@ -29,18 +29,24 @@ This object is still importable: ```python >>> from transformers import DetrImageProcessor ->>> print(DetrImageProcessor) - +>>> print(DetrImageProcessor) # doctest: +ELLIPSIS + ``` However, no method can be called on that object: ```python +>>> from transformers.utils.import_utils import BACKENDS_MAPPING, DummyObject +>>> _torchvision_backend = BACKENDS_MAPPING["torchvision"] +>>> BACKENDS_MAPPING["torchvision"] = (lambda: False, _torchvision_backend[1].lstrip("\n")) +>>> DetrImageProcessor = DummyObject("DetrImageProcessor", (), {"_backends": ["torchvision"]}) >>> DetrImageProcessor.from_pretrained() -ImportError: -DetrImageProcessor requires the Torchvision library but it was not found in your environment. Check out the instructions on the +Traceback (most recent call last): +... +ImportError: DetrImageProcessor requires the Torchvision library but it was not found in your environment. Check out the instructions on the installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. Please note that you may need to restart your runtime after installation. +>>> BACKENDS_MAPPING["torchvision"] = _torchvision_backend ``` Let's see how to specify specific object dependencies. diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index faca097d1160..16f20999a954 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -34,6 +34,7 @@ pipeline but can provide additional quality of life. 
 Simple call on one item:
 
 ```python
+>>> from transformers import pipeline
 >>> pipe = pipeline("text-classification")
 >>> pipe("This restaurant is awesome")
 [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
diff --git a/docs/source/en/model_doc/pe_audio_video.md b/docs/source/en/model_doc/pe_audio_video.md
index e116724d43f5..af0db76537f5 100644
--- a/docs/source/en/model_doc/pe_audio_video.md
+++ b/docs/source/en/model_doc/pe_audio_video.md
@@ -26,7 +26,50 @@ TODO
 ### Basic usage
 
 ```py
-TODO
+import torch
+
+from transformers import PeAudioVideoModel, PeAudioVideoProcessor
+
+model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large", device_map="cuda", dtype=torch.bfloat16)
+processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")
+
+from huggingface_hub import hf_hub_download
+
+video_path = hf_hub_download(
+    repo_id="eustlb/dummy-video-dataset", filename="audiobox.mp4", repo_type="dataset"
+)
+
+video_path2 = hf_hub_download(
+    repo_id="eustlb/dummy-video-dataset", filename="glass_breaking.mp4", repo_type="dataset"
+)
+
+audio_path = hf_hub_download(
+    repo_id="eustlb/dummy-video-dataset", filename="audiobox.mp4", repo_type="dataset"
+)
+
+audio_path2 = hf_hub_download(
+    repo_id="eustlb/dummy-video-dataset", filename="glass_breaking.mp4", repo_type="dataset"
+)
+
+video_files = [video_path, video_path2]
+descriptions = ["A woman and a man speaking", "A glass breaking"]
+audio_files = [audio_path, audio_path2]
+
+inputs = processor(
+    videos=video_files, text=descriptions, audio=audio_files, return_tensors="pt", padding=True
+)
+
+with torch.inference_mode(), torch.autocast(model.device.type, dtype=torch.bfloat16):
+    outputs = model(**inputs.to(model.device, dtype=model.dtype))
+
+audio_embeds = outputs.audio_embeds  # Audio-only embeddings
+video_embeds = outputs.video_embeds  # Video-only embeddings
+audio_video_embeds = outputs.audio_video_embeds  # Joint audio-video embeddings
+text_audio_embeds = outputs.text_audio_embeds  # Text embeddings aligned to audio
+text_video_embeds = outputs.text_video_embeds  # Text embeddings aligned to video
+text_audio_video_embeds = outputs.text_audio_video_embeds  # Text embeddings aligned to audio-video
+audio_plus_text_embeds = outputs.audio_plus_text_embeds  # Joint audio and text embedding
+video_plus_text_embeds = outputs.video_plus_text_embeds  # Joint video and text embedding
 ```
 
 ## PeAudioVideoProcessor
diff --git a/docs/source/en/model_doc/qwen3_5.md b/docs/source/en/model_doc/qwen3_5.md
index 1d542dd918ce..aae67e8a8e7a 100644
--- a/docs/source/en/model_doc/qwen3_5.md
+++ b/docs/source/en/model_doc/qwen3_5.md
@@ -70,14 +70,19 @@ TODO
 [[autodoc]] Qwen3_5ForCausalLM
     - forward
 
+## Qwen3_5ForConditionalGeneration
+
+[[autodoc]] Qwen3_5ForConditionalGeneration
+    - forward
+
 ## Qwen3_5ForSequenceClassification
 
 [[autodoc]] Qwen3_5ForSequenceClassification
     - forward
 
-## Qwen3_5ForConditionalGeneration
+## Qwen3_5TextForSequenceClassification
 
-[[autodoc]] Qwen3_5ForConditionalGeneration
+[[autodoc]] Qwen3_5TextForSequenceClassification
     - forward
 
 ## Qwen3_5Tokenizer
diff --git a/docs/source/en/tasks/zero_shot_object_detection.md b/docs/source/en/tasks/zero_shot_object_detection.md
index 8a5506939898..aa15ff46f05d 100644
--- a/docs/source/en/tasks/zero_shot_object_detection.md
+++ b/docs/source/en/tasks/zero_shot_object_detection.md
@@ -168,8 +168,7 @@ boxes have the correct coordinates relative to the original image:
 ...     outputs = model(**inputs)
 
 >>> results = processor.post_process_grounded_object_detection(
-...     outputs, threshold=0.50, target_sizes=[(image.height, image.width)], text_labels=text_labels,
-... 
)[0] +... outputs, threshold=0.50, target_sizes=[(image.height, image.width)], text_labels=text_labels)[0] >>> draw = ImageDraw.Draw(image) diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index ff6e666a804e..a207b5d32e0f 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -160,10 +160,8 @@ def create_causal_mask_mapping( # from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be # running generation with custom loop. Thus we need to infer it in a `non-perfect` way # NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible. - is_first_iteration = ( - is_first_iteration - if is_first_iteration - else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None) + is_first_iteration = is_first_iteration or ( + past_key_values is None or not past_key_values.is_initialized or pixel_values is not None ) if is_first_iteration or not kwargs.get("use_cache", True): @@ -256,9 +254,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index f6d13078bbc6..5e539047a6b9 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -633,7 +633,7 @@ def preprocess_images(examples): ) # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. - if accelerator.distributed_type == DistributedType.TPU: + if accelerator.distributed_type == DistributedType.XLA: model.tie_weights() # We need to recalculate our total training steps as the size of the training dataloader may have changed. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 0f8d2cd0d6e3..d91340a3afe4 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -553,7 +553,7 @@ def group_texts(examples): ) # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. - if accelerator.distributed_type == DistributedType.TPU: + if accelerator.distributed_type == DistributedType.XLA: model.tie_weights() # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
@@ -627,6 +627,7 @@ def group_texts(examples): model.train() if args.with_tracking: total_loss = 0 + total_samples = 0 if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: # We skip the first `n` batches in the dataloader when resuming from a checkpoint active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) @@ -638,7 +639,9 @@ def group_texts(examples): loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: - total_loss += loss.detach().float() + batch_size = batch["input_ids"].shape[0] + total_loss += loss.detach().float() * batch_size + total_samples += batch_size accelerator.backward(loss) optimizer.step() lr_scheduler.step() @@ -665,7 +668,8 @@ def group_texts(examples): outputs = model(**batch) loss = outputs.loss - losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size))) + batch_size = batch["input_ids"].shape[0] + losses.append(accelerator.gather_for_metrics(loss.repeat(batch_size))) losses = torch.cat(losses) try: @@ -681,7 +685,7 @@ def group_texts(examples): { "perplexity": perplexity, "eval_loss": eval_loss, - "train_loss": total_loss.item() / len(train_dataloader), + "train_loss": total_loss.item() / total_samples, "epoch": epoch, "step": completed_steps, }, diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 962e497b72e0..a0c0ff8b7da0 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -743,7 +743,7 @@ def apply_fim(examples): ) # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. - if accelerator.distributed_type == DistributedType.TPU: + if accelerator.distributed_type == DistributedType.XLA: model.tie_weights() # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
@@ -817,6 +817,7 @@ def apply_fim(examples): model.train() if args.with_tracking: total_loss = 0 + total_samples = 0 if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: # We skip the first `n` batches in the dataloader when resuming from a checkpoint active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) @@ -828,7 +829,9 @@ def apply_fim(examples): loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: - total_loss += loss.detach().float() + batch_size = batch["input_ids"].shape[0] + total_loss += loss.detach().float() * batch_size + total_samples += batch_size accelerator.backward(loss) optimizer.step() lr_scheduler.step() @@ -855,7 +858,8 @@ def apply_fim(examples): outputs = model(**batch) loss = outputs.loss - losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size))) + batch_size = batch["input_ids"].shape[0] + losses.append(accelerator.gather_for_metrics(loss.repeat(batch_size))) losses = torch.cat(losses) try: @@ -871,7 +875,7 @@ def apply_fim(examples): { "perplexity": perplexity, "eval_loss": eval_loss, - "train_loss": total_loss.item() / len(train_dataloader), + "train_loss": total_loss.item() / total_samples, "epoch": epoch, "step": completed_steps, }, diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 981a496badad..a4ed188c0fa1 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -582,7 +582,7 @@ def group_texts(examples): ) # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. - if accelerator.distributed_type == DistributedType.TPU: + if accelerator.distributed_type == DistributedType.XLA: model.tie_weights() # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
@@ -656,6 +656,7 @@ def group_texts(examples): model.train() if args.with_tracking: total_loss = 0 + total_samples = 0 if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: # We skip the first `n` batches in the dataloader when resuming from a checkpoint active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) @@ -667,7 +668,9 @@ def group_texts(examples): loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: - total_loss += loss.detach().float() + batch_size = batch["input_ids"].shape[0] + total_loss += loss.detach().float() * batch_size + total_samples += batch_size accelerator.backward(loss) optimizer.step() lr_scheduler.step() @@ -695,7 +698,8 @@ def group_texts(examples): outputs = model(**batch) loss = outputs.loss - losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size))) + batch_size = batch["input_ids"].shape[0] + losses.append(accelerator.gather_for_metrics(loss.repeat(batch_size))) losses = torch.cat(losses) try: @@ -711,7 +715,7 @@ def group_texts(examples): { "perplexity": perplexity, "eval_loss": eval_loss, - "train_loss": total_loss.item() / len(train_dataloader), + "train_loss": total_loss.item() / total_samples, "epoch": epoch, "step": completed_steps, }, diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 457ccc9001bf..573adbe46c81 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -412,8 +412,9 @@ def main(): # Trying to have good defaults here, don't hesitate to tweak to your needs. + label_feature = raw_datasets["train"].features["label"] is_regression = ( - raw_datasets["train"].features["label"].dtype in ["float32", "float64"] + getattr(label_feature, "dtype", None) in ["float32", "float64"] if data_args.do_regression is None else data_args.do_regression ) @@ -439,7 +440,7 @@ def main(): raise error else: # classification - if raw_datasets["train"].features["label"].dtype == "list": # multi-label classification + if isinstance(raw_datasets["train"].features["label"], datasets.Sequence): # multi-label classification is_multi_label = True logger.info("Label type is list, doing multi-label classification") # Trying to find the number of labels in a multi-label classification task diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index a705bc94a7f3..6fb8a786dc27 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -446,6 +446,9 @@ def main(): ) model.config.forced_bos_token_id = forced_bos_token_id + if hasattr(model, "generation_config") and model.generation_config is not None: + model.generation_config.forced_bos_token_id = forced_bos_token_id + # Get the language codes for input/target. 
 source_lang = data_args.source_lang.split("_")[0]
 target_lang = data_args.target_lang.split("_")[0]
diff --git a/scripts/check_tokenizers.py b/scripts/check_tokenizers.py
index 93d7fb5afdc6..cd136a67124c 100644
--- a/scripts/check_tokenizers.py
+++ b/scripts/check_tokenizers.py
@@ -10,37 +10,27 @@
 logging.set_verbosity_info()
 
+# Mapping of slow -> fast tokenizer classes
 TOKENIZER_CLASSES = {
     name: (getattr(transformers, name), getattr(transformers, name + "Fast")) for name in SLOW_TO_FAST_CONVERTERS
 }
 
-dataset = datasets.load_dataset("facebook/xnli", split="test+validation")  # no-script
+# Load a small English subset of XNLI for quick checks; use the "all_languages" config with the full "test+validation" split for an exhaustive run
+dataset = datasets.load_dataset("facebook/xnli", "en", split="test+validation[:10]")
 
-total = 0
-perfect = 0
-imperfect = 0
-wrong = 0
+total = perfect = imperfect = wrong = 0
 
 
 def check_diff(
     spm_diff: list[int], tok_diff: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
 ) -> bool:
     if spm_diff == list(reversed(tok_diff)):
-        # AAA -> AA+A vs A+AA case.
         return True
     elif len(spm_diff) == len(tok_diff) and fast.decode(spm_diff) == fast.decode(tok_diff):
-        # Second order OK
-        # Barrich -> Barr + ich vs Bar + rich
         return True
     spm_reencoded = slow.encode(slow.decode(spm_diff))
     tok_reencoded = fast.encode(fast.decode(spm_diff))
     if spm_reencoded != spm_diff and spm_reencoded == tok_reencoded:
-        # Type 3 error.
-        # Snehagatha ->
-        #       Sne, h, aga, th, a
-        #       Sne, ha, gat, ha
-        # Encoding the wrong with sp does not even recover what spm gave us
-        # It fits tokenizer however...
         return True
     return False
 
@@ -59,8 +49,6 @@ def check_LTR_mark(line: str, idx: int, fast: PreTrainedTokenizerBase) -> bool:
 def check_details(
     line: str, spm_ids: list[int], tok_ids: list[int], slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase
 ) -> bool:
-    # Encoding can be the same with same result AAA -> A + AA vs AA + A
-    # We can check that we use at least exactly the same number of tokens.
for i, (spm_id, tok_id) in enumerate(zip(spm_ids, tok_ids)): if spm_id != tok_id: break @@ -80,11 +68,9 @@ def check_details( return True if last - first > 5: - # We might have twice a single problem, attempt to subdivide the disjointed tokens into smaller problems spms = Counter(spm_ids[first:last]) toks = Counter(tok_ids[first:last]) - - removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si} + removable_tokens = {spm_ for spm_, si in spms.items() if toks.get(spm_, 0) == si} min_width = 3 for i in range(last - first - min_width): if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)): @@ -105,25 +91,11 @@ def check_details( ): return True - print(f"Spm: {[fast.decode([spm_ids[i]]) for i in range(first, last)]}") - try: - print(f"Tok: {[fast.decode([tok_ids[i]]) for i in range(first, last)]}") - except Exception as e: - print(f"Could not decode tok_ids: {e}") - - fast.decode(spm_ids[:first]) - fast.decode(spm_ids[last:]) - wrong = fast.decode(spm_ids[first:last]) - print() - print(wrong) return False def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, text: str) -> None: - global perfect - global imperfect - global wrong - global total + global perfect, imperfect, wrong, total slow_ids = slow.encode(text) fast_ids = fast.encode(text) @@ -140,9 +112,6 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te else: perfect += 1 - if total % 10000 == 0: - print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") - if skip_assert: return @@ -151,29 +120,51 @@ def test_string(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase, te ) -def test_tokenizer(slow: PreTrainedTokenizerBase, fast: PreTrainedTokenizerBase) -> None: - global batch_total - for i in range(len(dataset)): - # premise, all languages - for text in dataset[i]["premise"].values(): - test_string(slow, fast, text) - - # hypothesis, all languages - for text in dataset[i]["hypothesis"]["translation"]: - test_string(slow, fast, text) +def test_tokenizer(slow, fast, dry_run=True): + global total, perfect, imperfect, wrong + total = perfect = imperfect = wrong = 0 + n_samples = 5 if dry_run else len(dataset) + for i in range(n_samples): + premise = dataset[i]["premise"] + hypothesis = dataset[i]["hypothesis"] + test_string(slow, fast, premise) + test_string(slow, fast, hypothesis) if __name__ == "__main__": + DEFAULT_CHECKPOINTS = { + "BertTokenizer": "bert-base-uncased", + "BertTokenizerFast": "bert-base-uncased", + "AlbertTokenizer": "albert-base-v2", + "AlbertTokenizerFast": "albert-base-v2", + "BartTokenizer": "facebook/bart-base", + "BartTokenizerFast": "facebook/bart-base", + "BarthezTokenizer": "facebook/barthez", + "DPRReaderTokenizer": "facebook/dpr-reader-single-nq-base", + "DPRReaderTokenizerFast": "facebook/dpr-reader-single-nq-base", + } + for name, (slow_class, fast_class) in TOKENIZER_CLASSES.items(): - checkpoint_names = list(slow_class.max_model_input_sizes.keys()) - for checkpoint in checkpoint_names: - imperfect = 0 - perfect = 0 - wrong = 0 - total = 0 + checkpoint = DEFAULT_CHECKPOINTS.get(name) + if checkpoint is None: + print(f"Skipping {name}: no compatible checkpoint defined") + continue + try: print(f"========================== Checking {name}: {checkpoint} ==========================") slow = slow_class.from_pretrained(checkpoint, force_download=True) fast = fast_class.from_pretrained(checkpoint, force_download=True) - test_tokenizer(slow, fast) - print(f"Accuracy {perfect * 
100 / total:.2f}") + + test_tokenizer(slow, fast, dry_run=True) + + if total > 0: + print(f"Accuracy {perfect * 100 / total:.2f}% ({perfect}/{total} perfect)") + else: + print("No samples tested.") + + except ImportError as e: + print(f"Skipping {name} due to missing dependency: {e}") + continue + except Exception as e: + print(f"Skipping {name} due to error: {e}") + continue diff --git a/setup.py b/setup.py index 42c865b1b9ba..a7c57f463852 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ "kenlm", "kernels>=0.12.0,<0.13", "librosa", - "mistral-common[image]>=1.10.0", + "mistral-common[image,audio]>=1.10.0", "nltk<=3.8.1", "num2words", "numpy>=1.17", @@ -165,6 +165,7 @@ "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", + "requests", ] # This is a lookup table with items like: {"tokenizers": "tokenizers==0.9.4", "packaging": "packaging"}, i.e. @@ -192,7 +193,7 @@ def deps_list(*pkgs): extras["kernels"] = deps_list("kernels") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["tiktoken"] = deps_list("tiktoken", "blobfile") -extras["mistral-common"] = deps_list("mistral-common[image]") +extras["mistral-common"] = deps_list("mistral-common[image,audio]") extras["chat_template"] = deps_list("jinja2", "jmespath") extras["sklearn"] = deps_list("scikit-learn") extras["accelerate"] = deps_list("accelerate") @@ -205,7 +206,9 @@ def deps_list(*pkgs): extras["ray"] = deps_list("ray[tune]") extras["integrations"] += extras["ray"] extras["codecarbon"] = deps_list("codecarbon") -extras["serving"] = deps_list("openai", "pydantic", "uvicorn", "fastapi", "starlette", "rich") + extras["torch"] +extras["serving"] = ( + deps_list("openai", "pydantic", "uvicorn", "fastapi", "starlette", "rich", "requests") + extras["torch"] +) extras["num2words"] = deps_list("num2words") extras["benchmark"] = deps_list("optimum-benchmark") extras["ja"] = deps_list("fugashi", "ipadic", "unidic_lite", "unidic", "rhoknp") diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index c89618f2d9cb..9f02d5146326 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -88,6 +88,12 @@ def load_audio(audio: str | np.ndarray, sampling_rate=16000, timeout=None) -> np # needed. Do not raise any errors if not installed or versions do not match if is_torchcodec_available() and version.parse("0.3.0") <= TORCHCODEC_VERSION: audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate, timeout=timeout) + elif audio.rsplit("?", 1)[0].lower().endswith((".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")): + raise RuntimeError( + f"The audio source appears to be a video file ('{audio.split('/')[-1]}'). " + "librosa cannot decode video containers. " + "Install torchcodec>=0.3.0 (`pip install torchcodec`) to load audio from video files." 
+ ) else: audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout) elif not isinstance(audio, np.ndarray): diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 95a47ae39fdf..da144bd8897a 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -353,6 +353,24 @@ def get_max_cache_shape(self) -> int: """Return the maximum cache shape of the cache""" return self.max_cache_len + def crop(self, max_length: int) -> None: + """Crop the cache to the given length.""" + if not self.is_initialized: + return + + current_length = self.cumulative_length.item() + + if max_length < 0: + raise ValueError(f"`max_length` passed to `StaticLayer.crop()` must be >= 0, got {max_length}.") + + if max_length >= current_length: + return + + self.keys[:, :, max_length:, :].zero_() + self.values[:, :, max_length:, :].zero_() + + self.cumulative_length.fill_(max_length) + class StaticSlidingWindowLayer(StaticLayer): """ @@ -531,6 +549,14 @@ def update( self._quantized_values = self._quantize(value_states.contiguous(), axis=self.axis_value) return key_states, value_states + # After reset, quantized data is cleared + if self._quantized_keys is None: + self._quantized_keys = self._quantize(key_states.contiguous(), axis=self.axis_key) + self._quantized_values = self._quantize(value_states.contiguous(), axis=self.axis_value) + self.keys = torch.tensor([], dtype=key_states.dtype, device=key_states.device) + self.values = torch.tensor([], dtype=key_states.dtype, device=key_states.device) + return key_states, value_states + dequant_keys = self._dequantize(self._quantized_keys) dequant_values = self._dequantize(self._quantized_values) keys_to_return = torch.cat([dequant_keys, self.keys, key_states], dim=-2) @@ -552,6 +578,11 @@ def _quantize(self, tensor, axis): ... @abstractmethod def _dequantize(self, q_tensor): ... + def reset(self) -> None: + super().reset() + self._quantized_keys = None + self._quantized_values = None + def get_seq_length(self) -> int: """Returns the sequence length of the cached states.""" return self.cumulative_length @@ -1337,6 +1368,17 @@ def __init__( offload_only_non_sliding: bool = True, **kwargs, ): + if kwargs: + raise TypeError(f"Unknown arguments passed to StaticCache: {list(kwargs.keys())}") + + if not isinstance(offloading, bool): + raise TypeError( + f"`offloading` must be a bool, got {type(offloading)}. " + "Did you accidentally pass `device` as a positional argument?" 
+ ) + if not isinstance(offload_only_non_sliding, bool): + raise TypeError(f"`offload_only_non_sliding` must be a bool, got {type(offload_only_non_sliding)}.") + config = config.get_text_config(decoder=True) layer_types = getattr(config, "layer_types", None) # If `layer_types` is not explicitly provided, infer if the model is fully sliding diff --git a/src/transformers/cli/serve.py b/src/transformers/cli/serve.py index 3d7c6a0c51ba..77fd7b134e01 100644 --- a/src/transformers/cli/serve.py +++ b/src/transformers/cli/serve.py @@ -150,6 +150,7 @@ def __init__( completion_handler=self._completion_handler, response_handler=self._response_handler, transcription_handler=self._transcription_handler, + generation_state=self._generation_state, enable_cors=enable_cors, ) diff --git a/src/transformers/cli/serving/server.py b/src/transformers/cli/serving/server.py index 13a9565db590..f3fc46e9ad1c 100644 --- a/src/transformers/cli/serving/server.py +++ b/src/transformers/cli/serving/server.py @@ -32,7 +32,7 @@ from .model_manager import ModelManager from .response import ResponseHandler from .transcription import TranscriptionHandler -from .utils import X_REQUEST_ID +from .utils import X_REQUEST_ID, CBWorkerDeadError, GenerationState logger = logging.get_logger(__name__) @@ -44,6 +44,7 @@ def build_server( completion_handler: CompletionHandler, response_handler: ResponseHandler, transcription_handler: TranscriptionHandler, + generation_state: GenerationState, enable_cors: bool = False, ) -> FastAPI: """Build and return a configured FastAPI application. @@ -52,6 +53,7 @@ def build_server( model_manager: Handles model loading, caching, and cleanup. chat_handler: Handles `/v1/chat/completions` requests. response_handler: Handles `/v1/responses` requests. + generation_state: Shared generation state, used by `/health` to report CB liveness. enable_cors: If `True`, adds permissive CORS middleware (allow all origins). Returns: @@ -65,6 +67,12 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) + @app.exception_handler(CBWorkerDeadError) + async def _cb_dead_handler(_request: Request, exc: CBWorkerDeadError): + # CB worker died (e.g. CUDA illegal memory access); reject new requests with 503 + # carrying the cause, instead of letting them hang in the input queue forever. + return JSONResponse({"error": str(exc)}, status_code=503) + if enable_cors: app.add_middleware( CORSMiddleware, @@ -128,6 +136,8 @@ def list_models(): @app.get("/health") def health(): + if not generation_state.is_cb_alive(): + return JSONResponse({"status": "unhealthy", "reason": "cb_worker_dead"}, status_code=503) return JSONResponse({"status": "ok"}) return app diff --git a/src/transformers/cli/serving/utils.py b/src/transformers/cli/serving/utils.py index d786a828fc28..165a56e8ddd7 100644 --- a/src/transformers/cli/serving/utils.py +++ b/src/transformers/cli/serving/utils.py @@ -73,6 +73,14 @@ class _GenerationCancelled(Exception): """Raised inside ``DirectStreamer.put()`` to abort ``model.generate()``.""" +class CBWorkerDeadError(RuntimeError): + """Raised when a request is submitted to a CB worker that has died. + + Surfaced as 503 by the FastAPI exception handler. Carries the original error message + that killed the worker so the client knows why the server is in this state. + """ + + # Fallback tool call configs for models that don't declare stc_token/etc_token/response_schema # on their tokenizer. # Keys are matched via substring against model_type (e.g. "qwen" matches "qwen2", "qwen3_vl", etc.). 
@@ -635,6 +643,21 @@ def init_cb(self, model: "PreTrainedModel", gen_config: "GenerationConfig") -> N ) self._cb.start() + def is_alive(self) -> bool: + """Whether the CB worker is healthy and able to serve new requests.""" + return self._cb is not None and self._cb.fatal_error is None + + def _check_alive(self, request_id: str) -> None: + """Raise :class:`CBWorkerDeadError` if the CB worker has died. + + Called at request entry to fail fast — submitting to a dead worker would otherwise + enqueue the request into a void where it never gets processed. + """ + if self._cb is not None and self._cb.fatal_error is not None: + raise CBWorkerDeadError( + f"CB worker is dead and cannot accept request {request_id}: {self._cb.fatal_error}" + ) + def generate_streaming( self, model: "PreTrainedModel", @@ -648,6 +671,7 @@ def generate_streaming( cb = self._cb if cb is None: raise RuntimeError("CB manager not initialized. Call `init_cb()` first.") + self._check_alive(request_id) loop = asyncio.get_running_loop() text_queue: asyncio.Queue = asyncio.Queue() @@ -669,7 +693,13 @@ def generate_streaming( def _on_output(output): try: streamer.put(output) - if output.is_finished(): + # ``error`` is set together with ``status = FAILED`` in CB's _handle_request_error. + # Surface it as an end-of-stream error so the SSE handler can emit it and close, + # instead of leaving the client hanging on a stream that will never end. + if output.error is not None: + text_queue.put_nowait(_StreamError(output.error)) + streamer.end() + elif output.is_finished(): streamer.end() except Exception as e: text_queue.put_nowait(_StreamError(str(e))) @@ -689,6 +719,7 @@ async def generate_non_streaming( cb = self._cb if cb is None: raise RuntimeError("CB manager not initialized. Call `init_cb()` first.") + self._check_alive(request_id) input_ids = inputs["input_ids"] input_len = len(input_ids) @@ -711,8 +742,16 @@ def _on_result(result): eos_token_id=gen_config.eos_token_id, ) result = await future - if result is None: - raise RuntimeError(f"CB manager stopped before producing a result for {request_id}") + # CB signals a failed request by setting ``error`` (and ``status = FAILED``) on the + # delivered GenerationOutput, often with empty ``generated_tokens``. Surface it instead + # of returning an empty success that downstream parsing/decoding would silently mask. + # If the worker itself died, route to CBWorkerDeadError so the client gets the same 503 + # as requests submitted post-crash; otherwise it's a per-request failure (e.g. unsupported + # logit-processor kwarg) and a plain RuntimeError -> 500 is appropriate. + if result.error is not None: + if self._cb.fatal_error is not None: + raise CBWorkerDeadError(f"CB worker died during request {request_id}: {result.error}") + raise RuntimeError(f"CB generation failed for {request_id}: {result.error}") generated_ids = result.generated_tokens text = processor.decode(generated_ids, skip_special_tokens=True) return text, input_len, generated_ids @@ -805,6 +844,12 @@ def shutdown(self) -> None: self._cb_manager.stop() self._cb_manager = None + def is_cb_alive(self) -> bool: + """Whether the CB worker is healthy. ``True`` if CB is disabled or not yet initialized.""" + if self._cb_manager is None: + return True + return self._cb_manager.is_alive() + class BaseHandler: """Shared logic for chat completion and responses handlers. 
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 2dcdc5333f35..073b23172251 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -21,7 +21,7 @@ from collections.abc import Sequence from dataclasses import MISSING, dataclass, fields from functools import wraps -from typing import TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, Union +from typing import Any, ClassVar, Literal, TypeVar from huggingface_hub import create_repo from huggingface_hub.dataclasses import strict @@ -43,10 +43,7 @@ logging, ) from .utils.generic import is_timm_config_dict - - -if TYPE_CHECKING: - import torch +from .utils.type_validators import dtype_validator logger = logging.get_logger(__name__) @@ -229,7 +226,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): # Common attributes for all models output_hidden_states: bool | None = False return_dict: bool | None = True - dtype: Union[str, "torch.dtype"] | None = None + dtype: Any = dtype_validator(default=None) chunk_size_feed_forward: int = 0 is_encoder_decoder: bool = False @@ -1161,6 +1158,7 @@ def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None: "ignore_keys_at_rope_validation", "base_model_tp_plan", "base_model_pp_plan", + "distributed_config", ]: d.pop(key_to_remove, None) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index dadfeb4224ad..5a865164747c 100755 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -62,19 +62,8 @@ "rt_detr_v2": "rt_detr", "pp_doclayout_v2": "rt_detr", "pp_doclayout_v3": "rt_detr", - "paligemma": "llava", - "aya_vision": "llava", - "got_ocr2": "llava", - "shieldgemma2": "llava", - "gemma3": "llava", - "internvl": "llava", - "llava_next_video": "llava_next", - "llava_onevision": "llava_next", - "vipllava": "llava", - "mistral3": "llava", "qwen2_5_vl": "qwen2_vl", "sam3_tracker_video": "sam3_tracker", - "pp_chart2table": "llava", "altclip_vision_model": "clip_vision_model", "chinese_clip_vision_model": "clip_vision_model", "clipseg_vision_model": "clip_vision_model", @@ -89,6 +78,32 @@ "siglip_text_model": "clip_text_model", "siglip2_text_model": "clip_text_model", "xclip_text_model": "clip_text_model", + "shield_gemma2": "llava", + "paligemma": "llava", + "aya_vision": "llava", + "got_ocr2": "llava", + "gemma3": "llava", + "internvl": "llava", + "vipllava": "llava", + "mistral3": "llava", + "pp_chart2table": "llava", + "llava_next_video": "llava_next", + "llava_onevision": "llava_next", + # class-based mappings + "PaliGemmaModel": "LlavaModel", + "AyaVisionModel": "LlavaModel", + "GotOcr2Model": "LlavaModel", + "Gemma3Model": "LlavaModel", + "InternVLModel": "LlavaModel", + "VipLlavaModel": "LlavaModel", + "Mistral3Model": "LlavaModel", + "PPChart2TableModel": "LlavaModel", + "LlavaNextModel": "LlavaModel", + "LlavaNextVideoModel": "LlavaModel", + "LlavaOnevisionModel": "LlavaModel", + "FuyuModel": "LlavaModel", + "MllamaModel": "LlavaModel", + "Qwen2_5_VLModel": "Qwen2VLModel", } @@ -97,42 +112,55 @@ def _build_checkpoint_conversion_mapping(): "altclip": [ WeightRenaming(source_patterns=r"layer\.", target_patterns="layers."), ], + "LlavaModel": [ + WeightRenaming(source_patterns=r"^language_model.model", target_patterns="language_model"), + ], "llava": [ - WeightRenaming(source_patterns=r"^language_model.model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^language_model.lm_head", 
target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^language_model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^vision_tower", target_patterns="model.vision_tower"), WeightRenaming(source_patterns=r"^multi_modal_projector", target_patterns="model.multi_modal_projector"), ], "llava_next": [ - WeightRenaming(source_patterns=r"^language_model.model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^language_model.lm_head", target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^language_model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^vision_tower", target_patterns="model.vision_tower"), WeightRenaming(source_patterns=r"^multi_modal_projector", target_patterns="model.multi_modal_projector"), WeightRenaming(source_patterns=r"^image_newline", target_patterns="model.image_newline"), ], - "clip_vision_model": [PrefixChange(prefix_to_remove="vision_model")], + "clip_vision_model": [ + PrefixChange(prefix_to_remove="vision_model"), + # Keep old CLIP-like checkpoints loadable after fixing the historical typo in module names. + WeightRenaming(source_patterns=r"layrnorm", target_patterns="layernorm"), + ], "clip_text_model": [PrefixChange(prefix_to_remove="text_model")], + "VideoLlavaModel": [ + WeightRenaming(source_patterns=r"^language_model.model", target_patterns="language_model"), + ], "video_llava": [ - WeightRenaming(source_patterns=r"^language_model.model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^language_model.lm_head", target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^language_model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^image_tower", target_patterns="model.image_tower"), WeightRenaming(source_patterns=r"^video_tower", target_patterns="model.video_tower"), WeightRenaming(source_patterns=r"^multi_modal_projector", target_patterns="model.multi_modal_projector"), ], "fuyu": [ - WeightRenaming(source_patterns=r"^language_model.model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^language_model.lm_head", target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^language_model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^vision_embed_tokens", target_patterns="model.vision_embed_tokens"), ], "mllama": [ - WeightRenaming(source_patterns=r"^language_model.model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^language_model.lm_head", target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^language_model", target_patterns="model.language_model"), WeightRenaming(source_patterns=r"^vision_model", target_patterns="model.vision_model"), WeightRenaming(source_patterns=r"^multi_modal_projector", target_patterns="model.multi_modal_projector"), ], + "Emu3Model": [ + WeightRenaming(source_patterns=r"^text_model.model", target_patterns="text_model"), + ], "emu3": [ - WeightRenaming(source_patterns=r"^text_model.model", target_patterns="model.text_model"), WeightRenaming(source_patterns=r"^text_model.lm_head", target_patterns="lm_head"), + WeightRenaming(source_patterns=r"^text_model", target_patterns="model.text_model"), WeightRenaming(source_patterns=r"^vqmodel", target_patterns="model.vqmodel"), ], "paddleocr_vl": [ @@ -143,15 +171,12 @@ def _build_checkpoint_conversion_mapping(): target_patterns="model.language_model", ), ], + "Qwen2VLModel": [WeightRenaming(source_patterns=r"^model.", target_patterns="")], 
"qwen2_vl": [ + WeightRenaming(source_patterns=r"^visual", target_patterns="model.visual"), WeightRenaming( source_patterns=r"(? None: + """ + Register a conversion mapping for a model type string or a class name. + + Class names take priority over ``model_type`` strings during lookup (see + :func:`extract_weight_conversions_for_model`), making it possible to define + task-head-specific or class-specific conversions that differ from the shared + ``model_type`` baseline. + """ global _checkpoint_conversion_mapping_cache if _checkpoint_conversion_mapping_cache is None: _checkpoint_conversion_mapping_cache = _build_checkpoint_conversion_mapping() - if model_type in _checkpoint_conversion_mapping_cache and not overwrite: - raise ValueError(f"Model type {model_type} already exists in the checkpoint conversion mapping.") - _checkpoint_conversion_mapping_cache[model_type] = mapping + if model_type_or_class_name in _checkpoint_conversion_mapping_cache and not overwrite: + raise ValueError( + f"Conversion mapping for '{model_type_or_class_name}' already exists. Pass overwrite=True to replace it." + ) + _checkpoint_conversion_mapping_cache[model_type_or_class_name] = mapping -def extract_weight_conversions_for_model(model: PreTrainedModel, model_prefix: str) -> list[WeightTransform] | None: +def extract_weight_conversions_for_model( + model: PreTrainedModel, +) -> list[WeightTransform] | None: + """ + Return the registered conversion list for ``model``, or ``None`` if none exists. + + Looks up by class name first (enables task-head-specific overrides), then + falls back to ``model.config.model_type``. Transforms are returned + unmodified; the caller sets ``scope_prefix`` on each transform for sub-module isolation. + """ + class_name = type(model).__name__ model_type = getattr(model.config, "model_type", None) - if model_type is not None: - model_specific_conversions = get_checkpoint_conversion_mapping(model_type) - # In this case, add the prefix to `PrefixChange` instances, in order to know where to add/remove the prefix - if model_specific_conversions is not None and model_prefix != "": - for i, conversion in enumerate(model_specific_conversions): - # In this case, add the prefix, as otherwise we don't know where we need to re-add it exactly in the module name chain - if isinstance(conversion, PrefixChange): - model_specific_conversions[i] = conversion.with_submodel_prefix(model_prefix) - return model_specific_conversions - return None + + # Class name takes priority — allows ForXxx-specific overrides + conversions = get_checkpoint_conversion_mapping(class_name) + if conversions is None and model_type is not None: + conversions = get_checkpoint_conversion_mapping(model_type) + return conversions def get_model_conversion_mapping( @@ -660,11 +696,17 @@ def get_model_conversion_mapping( add_legacy: bool = True, ) -> list[WeightTransform]: """ - For a given `model`, obtain the weight conversion mapping if any are registered either as a simple renaming - `_checkpoint_conversion_mapping` class argument, or in the general WeightConverter mapping. + Collect the ordered list of weight transforms for ``model`` (used during + loading and, when reversed, during saving). + + Each ``PreTrainedModel`` sub-module is looked up by class name then + ``model_type``. Root transforms are applied globally; sub-module transforms + have their ``scope_prefix`` set so they only match keys under that prefix. 
After any + sub-module is processed, both its class name and ``model_type`` are marked + seen to prevent ``XForY`` / ``XModel`` pairs from applying the same mapping + twice via different lookup paths. """ # Lazy import to avoid circular import issues - from .modeling_utils import PreTrainedModel # note: this function is used in PEFT, so changing the API requires coordination weight_conversions = [] @@ -673,16 +715,45 @@ def get_model_conversion_mapping( if key_mapping is not None: weight_conversions = [WeightRenaming(source_patterns=k, target_patterns=v) for k, v in key_mapping.items()] - # Model have several `PreTrainedModel` within with the same model type, for example: XForConditionalGeneration -> XModel - # We don't want to apply the same conversion pattern twice because of that - seen_model_types = set() - # Recurse over submodules and collect all conversions - for name, submodule in model.named_modules(): - if isinstance(submodule, PreTrainedModel) and submodule.config.model_type not in seen_model_types: - conversions = extract_weight_conversions_for_model(submodule, name) - if conversions is not None: - weight_conversions.extend(conversions) - seen_model_types.add(submodule.config.model_type) + seen_identifiers: set[str] = set() + + named_pretrained = getattr(model, "_named_pretrained_submodules", None) + if named_pretrained is None: + from .modeling_utils import PreTrainedModel + + named_pretrained = [(name, m) for name, m in model.named_modules() if isinstance(m, PreTrainedModel)] + for module_name, submodule in named_pretrained: + class_name = type(submodule).__name__ + model_type = getattr(submodule.config, "model_type", None) + + # Skip if this architecture was already processed via either lookup path. + if class_name in seen_identifiers or (model_type and model_type in seen_identifiers): + continue + + # Try class name first, then model_type. Track which path produced the hit so + # we know whether to block model_type for subsequent sub-modules (see below). + conversions = get_checkpoint_conversion_mapping(class_name) + found_via_class = conversions is not None + if not found_via_class and model_type is not None: + conversions = get_checkpoint_conversion_mapping(model_type) + + if conversions is None: + continue + + is_root_model = module_name == "" + if not is_root_model: + # Scope each transform so it only matches keys under this sub-module's prefix. + for transform in conversions: + transform.scope_prefix = module_name + weight_conversions.extend(conversions) + + seen_identifiers.add(class_name) + # Only block model_type when the hit was via model_type. When the hit was via + # class name, sub-modules that share the same model_type but have no class-specific + # mapping of their own (e.g. DetrModel under DetrForSegmentation) must still be + # reachable so their base transforms are picked up and scoped automatically. 
+ if not found_via_class and model_type: + seen_identifiers.add(model_type) if add_legacy: weight_conversions.extend(get_checkpoint_conversion_mapping("legacy")) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index cd0710649c91..0e9e88dfbc83 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -591,6 +591,7 @@ class WeightTransform: "_original_source_patterns", "_original_target_patterns", "_was_used", + "scope_prefix", ) def __init__(self, source_patterns: str | list[str], target_patterns: str | list[str]): @@ -608,6 +609,9 @@ def __init__(self, source_patterns: str | list[str], target_patterns: str | list # Flag to notice if the Transform was used self._was_used = False + # Optional prefix scope: when set, this transform only applies to keys starting with + # ``scope_prefix + "."``, stripping / re-attaching the prefix around the pattern match. + self.scope_prefix: str | None = None # We need to process a few exceptions here when instantiating the reverse mapping (i.e. the targets become # sources, and sources become targets). The issues lie in the sources usually, so here we need to check the @@ -673,6 +677,27 @@ def add_tensor(self, target_key: str, source_key: str, source_pattern: str, futu self.collected_tensors[source_pattern].append(future) self.layer_targets[target_key].add(source_key) + def _scoped_match(self, source_key: str) -> tuple[str | None, str, re.Match[str]] | None: + """ + Apply ``scope_prefix`` stripping (if any), then match ``compiled_sources`` against the suffix. + + Returns ``(prefix_dot, key_to_match, match_object)`` when a branch matches, where ``prefix_dot`` is ``None`` + if ``scope_prefix`` is unset, else ``f"{scope_prefix}."``. Returns ``None`` when out of scope or unmatched. + Does not set ``_was_used``. + """ + prefix_dot = None + key_to_match = source_key + if self.scope_prefix is not None: + prefix_dot = self.scope_prefix + "." + if not source_key.startswith(prefix_dot): + return None + key_to_match = source_key[len(prefix_dot) :] + + match_object = self.compiled_sources.search(key_to_match) + if match_object is None: + return None + return (prefix_dot, key_to_match, match_object) + def rename_source_key(self, source_key: str) -> tuple[str, str | None]: """ Return a tuple (renamed_key, source_pattern_producing_the_match). @@ -680,11 +705,12 @@ def rename_source_key(self, source_key: str) -> tuple[str, str | None]: In case of a one-to-many transform, i.e. we have several target patterns, the matching source pattern will be replaced by the first of all the target patterns (they are then correctly expanded in the Operations). 
""" - # Try matching one of the alternation branches - match_object = self.compiled_sources.search(source_key) - if match_object is None: + matched = self._scoped_match(source_key) + if matched is None: return source_key, None + prefix_dot, key_to_match, match_object = matched + # We have a match, so the Transform was used self._was_used = True @@ -699,7 +725,9 @@ def rename_source_key(self, source_key: str) -> tuple[str, str | None]: # inside that matched named group replaced_group_idx = self.compiled_sources.groupindex[matching_group_name] + 1 replacement = replacement.replace(r"\1", match_object.group(replaced_group_idx)) - renamed_key = source_key.replace(match_object.group(0), replacement, 1) + renamed_key = key_to_match.replace(match_object.group(0), replacement, 1) + if prefix_dot is not None: + renamed_key = prefix_dot + renamed_key return renamed_key, source_pattern_that_matched def reverse_transform(self) -> WeightTransform: @@ -717,7 +745,7 @@ def reverse_transform(self) -> WeightTransform: reverse_transform = self.__class__( source_patterns=self._original_target_patterns, target_patterns=self._original_source_patterns, **kwargs ) - + reverse_transform.scope_prefix = self.scope_prefix return reverse_transform def materialize_tensors(self) -> dict[str, list[torch.Tensor]]: @@ -836,15 +864,11 @@ def reverse_transform(self) -> WeightTransform: raise ValueError("Cannot reverse the transform with TP or quantization") # Only one of the 2 can ever be used, so 1 is always None - return PrefixChange( + result = PrefixChange( prefix_to_add=self.prefix_to_remove, prefix_to_remove=self.prefix_to_add, model_prefix=self.model_prefix ) - - def with_submodel_prefix(self, prefix: str) -> PrefixChange: - new_prefix = f"{prefix}.{self.model_prefix}" if self.model_prefix != "" else prefix - return PrefixChange( - prefix_to_add=self.prefix_to_add, prefix_to_remove=self.prefix_to_remove, model_prefix=new_prefix - ) + result.scope_prefix = self.scope_prefix + return result # List of classes that are known to be able to use m:n @@ -1077,6 +1101,8 @@ def set_param_for_module( if ref is not None and param_value.shape != expected_shape and hf_quantizer is None: loading_info.mismatched_keys.add((target_name, param_value.shape, expected_shape)) else: + if distributed_operation is not None: + param_value = distributed_operation.post_shard_wrap(param_value) # super important otherwise _init_weight will re-init the param param_value._is_hf_initialized = True setattr(module_obj, param_name, param_value) @@ -1112,30 +1138,50 @@ class SkipParameters(Exception): def rename_source_key( source_key: str, - weight_renamings: list[WeightRenaming], - weight_converters: list[WeightConverter], + weight_transforms: list[WeightTransform], prefix: str | None = None, meta_state_dict: dict | None = None, ) -> tuple[str, str | None]: """ - Rename a source key given all the renaming and weight conversion patterns we have. Also takes care of adding/removing - the base model prefix during loading if necessary. + Rename a source key according to ``weight_transforms``, also handling the base model prefix. + + Transforms are applied in list order, interleaving ``WeightRenaming`` and ``WeightConverter`` + instances as they appear. The same list, reversed and with each transform individually + inverted, is used on the save path, so relative ordering is preserved in both directions. + + At most one ``WeightConverter`` fires per key; subsequent converters are skipped. 
+ ``WeightRenaming`` always runs, even after a converter has already fired. + + Example (root rename followed by a scoped sub-model converter):: + + transforms = [ + WeightRenaming("^old_prefix", "model.vlm"), + WeightConverter("^q_proj", "qkv_proj", ...), # scope_prefix="model.vlm" + ] + # Load: "old_prefix.q_proj" + # → WeightRenaming → "model.vlm.q_proj" + # → WeightConverter → "model.vlm.qkv_proj" + # + # Save (inverted list, each transform reversed): + # "model.vlm.q_proj" + # → rev(WeightConverter) → "model.vlm.q_proj" + # → rev(WeightRenaming) → "old_prefix.q_proj" """ renamed_key = source_key - # 1. apply all renamings in turns (if multiple match, it's the responsibility of the mappings to make sure they - # are coherent) - for renaming in weight_renamings: - renamed_key, _ = renaming.rename_source_key(renamed_key) - - # 2. apply renaming through weight conversions on the key if we have any WeightConverter (here we stop after - # the first match, as we assume only 1 converter can match any source key) source_pattern = None - for converter in weight_converters: - renamed_key, source_pattern = converter.rename_source_key(renamed_key) - if source_pattern is not None: - break - # 3. check if we need to add or remove prefix if necessary (only during loading, not saving) + for transform in weight_transforms: + if isinstance(transform, WeightConverter): + if source_pattern is not None: + # Already matched a converter; skip subsequent converters. + continue + renamed_key, sp = transform.rename_source_key(renamed_key) + if sp is not None: + source_pattern = sp + else: + renamed_key, _ = transform.rename_source_key(renamed_key) + + # check if we need to add or remove prefix if necessary (only during loading, not saving) if prefix is not None and meta_state_dict is not None: if ( renamed_key.startswith(prefix) @@ -1277,7 +1323,6 @@ def convert_and_load_state_dict_in_model( else: thread_pool = ThreadPoolExecutor(max_workers=GLOBAL_WORKERS) - renamings = [entry for entry in weight_mapping if isinstance(entry, WeightRenaming)] converters = [entry for entry in weight_mapping if isinstance(entry, WeightConverter)] param_name_to_load: dict[str, WeightRenaming | WeightConverter] = {} @@ -1292,13 +1337,11 @@ def convert_and_load_state_dict_in_model( state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0])) for original_key, tensor in state_dict: - # 1. Rename the key according to all renaming pattern and optional weight converter patterns - renamed_key, source_pattern = rename_source_key( - original_key, renamings, converters, prefix, meta_model_state_dict - ) + # 1. Rename the key according to all renaming and weight conversion patterns. + renamed_key, source_pattern = rename_source_key(original_key, weight_mapping, prefix, meta_model_state_dict) if renamed_key not in meta_model_state_dict and original_key in meta_model_state_dict: - # Key should probably not have been renamed but we might need the `prefix` to be added.` - renamed_key, source_pattern = rename_source_key(original_key, [], [], prefix, meta_model_state_dict) + # Key should probably not have been renamed but we might need the `prefix` to be added. + renamed_key, source_pattern = rename_source_key(original_key, [], prefix, meta_model_state_dict) # 2. 
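
A minimal, self-contained sketch of the dispatch loop described in the docstring above, using plain `(kind, pattern, replacement)` tuples as stand-ins for the real `WeightRenaming`/`WeightConverter` objects: renamings always run, while only the first matching converter fires.

```python
import re

# Stand-ins for the real transform classes.
transforms = [
    ("renaming", r"^old_prefix", "model.vlm"),
    ("converter", r"q_proj", "qkv_proj"),
    ("converter", r"k_proj", "qkv_proj"),  # skipped once a converter has already fired
]

def rename(key: str) -> tuple[str, str | None]:
    matched_converter_pattern = None
    for kind, pattern, replacement in transforms:
        if kind == "converter" and matched_converter_pattern is not None:
            continue  # at most one converter per key
        new_key, n_substitutions = re.subn(pattern, replacement, key, count=1)
        if n_substitutions and kind == "converter":
            matched_converter_pattern = pattern
        key = new_key
    return key, matched_converter_pattern

print(rename("old_prefix.q_proj.weight"))  # ('model.vlm.qkv_proj.weight', 'q_proj')
print(rename("old_prefix.mlp.weight"))     # ('model.vlm.mlp.weight', None)
```
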
finally, collect the tensor into the proper converter if renamed_key in meta_model_state_dict: @@ -1460,15 +1503,14 @@ def revert_weight_conversion(model: PreTrainedModel, state_dict: dict[str, torch # Reverse all Transform to correctly match keys reverse_weight_conversion = [conversion.reverse_transform() for conversion in weight_conversions] # If we are still here, we need to create the (reverse) conversion mapping from scratch - renamings = [entry for entry in reverse_weight_conversion if isinstance(entry, WeightRenaming)] converters = [entry for entry in reverse_weight_conversion if isinstance(entry, WeightConverter)] pattern_to_converter = {k: converter for converter in converters for k in converter.source_patterns} conversion_mapping = {} state_dict = sorted(state_dict.items(), key=lambda kv: dot_natural_key(kv[0])) for original_key, tensor in state_dict: - # Rename the key according to all renaming pattern and optional weight converter patterns - renamed_key, source_pattern = rename_source_key(original_key, renamings, converters) + renamed_key, source_pattern = rename_source_key(original_key, reverse_weight_conversion) + if source_pattern is not None: new_converter = deepcopy(pattern_to_converter[source_pattern]) # each target key gets its own converter instance diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 8412ab5ae25a..aea74cee059b 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1369,6 +1369,7 @@ class DataCollatorWithFlattening(DefaultDataCollator): - uses `separator_id` to separate sequences within the concatenated `labels`, default value is -100 - no padding will be added, returns `input_ids`, `labels` and `position_ids` by default - optionally returns the kwargs contained in FlashAttentionKwargs + - optionally returns `cu_seqlens` for FLA-style kernels - optionally returns seq_idx indicating which sequence each token belongs to @@ -1385,6 +1386,7 @@ def __init__( return_position_ids=True, separator_id=-100, return_flash_attn_kwargs=False, + return_cu_seqlens=False, return_seq_idx=False, **kwargs, ): @@ -1392,6 +1394,7 @@ def __init__( self.return_position_ids = return_position_ids self.separator_id = separator_id self.return_flash_attn_kwargs = return_flash_attn_kwargs + self.return_cu_seqlens = return_cu_seqlens self.return_seq_idx = return_seq_idx self._int_64_keys = {"labels", "position_ids", "input_ids"} self._batch_dim_keys = {"labels", "position_ids", "input_ids", "seq_idx"} @@ -1408,7 +1411,7 @@ def __call__(self, features, return_tensors=None, separator_id=None): batch.update({"position_ids": []}) if self.return_seq_idx: batch.update({"seq_idx": []}) - if self.return_flash_attn_kwargs: + if self.return_flash_attn_kwargs or self.return_cu_seqlens: cu_seq_lens = [0] max_length = 0 for seq_idx, sample in enumerate(features): @@ -1423,20 +1426,25 @@ def __call__(self, features, return_tensors=None, separator_id=None): # Convert to list if tensor if hasattr(labels, "tolist"): labels = labels.tolist() - batch["labels"] += [separator_id] + labels[1:] + if isinstance(labels, (list, tuple)): + batch["labels"] += [separator_id] + labels[1:] + else: + batch["labels"] += [labels] * len(input_ids) else: batch["labels"] += [separator_id] + input_ids[1:] if self.return_position_ids: batch["position_ids"] += list(range(len(input_ids))) if self.return_seq_idx: batch["seq_idx"] += [seq_idx for _ in range(len(input_ids))] - if self.return_flash_attn_kwargs: + if 
self.return_flash_attn_kwargs or self.return_cu_seqlens: cu_seq_lens.append(cu_seq_lens[-1] + len(input_ids)) max_length = max(max_length, len(input_ids)) if self.return_flash_attn_kwargs: batch["cu_seq_lens_q"] = batch["cu_seq_lens_k"] = cu_seq_lens batch["max_length_q"] = batch["max_length_k"] = max_length + if self.return_cu_seqlens: + batch["cu_seqlens"] = cu_seq_lens # FlashAttentionKwargs and seq_idx are expected to be int32s. if return_tensors == "pt": diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py index 38ff0399641b..ae44ef1eb899 100644 --- a/src/transformers/debug_utils.py +++ b/src/transformers/debug_utils.py @@ -155,7 +155,7 @@ def __init__(self, model, max_frames_to_save=21, trace_batch_nums=None, abort_af self.batch_number = 0 self.total_calls = 0 self.detected_overflow = False - self.prefix = " " + self.prefix = " " * 17 self.analyse_model() diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 9c9e7b929f6f..4598a6760090 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -311,6 +311,42 @@ def get_class_in_module( return getattr(module, class_name) +def _compute_local_source_files_hash( + pretrained_model_name_or_path: str | os.PathLike, + module_file: str | os.PathLike, + resolved_module_file: str | os.PathLike, + modules_needed: list[str], +) -> str: + """ + Computes a stable hash from the bytes of the local source file and its direct relative-import source files. + """ + model_path = Path(pretrained_model_name_or_path).resolve() + module_parent = Path(module_file).parent + + resolved_module_file = Path(resolved_module_file).resolve() + + def _resolve_relative_source_path(source_file_path: Path) -> str: + try: + return source_file_path.relative_to(model_path).as_posix() + except ValueError: + # Fallback for edge cases where the source file is not under the local model directory. + return source_file_path.as_posix() + + files_to_hash = [ + (_resolve_relative_source_path(resolved_module_file), resolved_module_file), + ] + for module_needed in modules_needed: + module_needed_path = (model_path / module_parent / f"{module_needed}.py").resolve() + files_to_hash.append((_resolve_relative_source_path(module_needed_path), module_needed_path)) + + source_files_hash = hashlib.sha256() + for relative_path, file_path in sorted(files_to_hash, key=lambda entry: entry[0]): + source_files_hash.update(relative_path.encode("utf-8")) + source_files_hash.update(file_path.read_bytes()) + + return source_files_hash.hexdigest()[:16] + + def get_cached_module_file( pretrained_model_name_or_path: str | os.PathLike, module_file: str, @@ -374,11 +410,10 @@ def get_cached_module_file( local_files_only = True # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. 
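
Stepping back to the `DataCollatorWithFlattening` hunk above, a small worked example of what the cumulative sequence lengths look like when two samples are packed into one flattened row (plain Python, no collator instance needed):

```python
# Two samples to be flattened into one packed row.
features = [
    {"input_ids": [101, 7, 8, 102]},       # length 4
    {"input_ids": [101, 9, 10, 11, 102]},  # length 5
]

input_ids, position_ids, cu_seqlens = [], [], [0]
for sample in features:
    ids = sample["input_ids"]
    input_ids += ids
    position_ids += list(range(len(ids)))         # positions restart per sequence
    cu_seqlens.append(cu_seqlens[-1] + len(ids))  # running boundary of each sequence

print(input_ids)     # [101, 7, 8, 102, 101, 9, 10, 11, 102]
print(position_ids)  # [0, 1, 2, 3, 0, 1, 2, 3, 4]
print(cu_seqlens)    # [0, 4, 9] -> sequence i spans input_ids[cu_seqlens[i]:cu_seqlens[i+1]]
```
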
- pretrained_model_name_or_path = str(pretrained_model_name_or_path) + pretrained_model_name_or_path = str(pretrained_model_name_or_path).rstrip(os.sep) is_local = os.path.isdir(pretrained_model_name_or_path) - if is_local: - submodule = _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)) - else: + cached_module = None + if not is_local: submodule = os.path.sep.join(map(_sanitize_module_name, pretrained_model_name_or_path.split("/"))) cached_module = try_to_load_from_cache( pretrained_model_name_or_path, module_file, cache_dir=cache_dir, revision=_commit_hash, repo_type=repo_type @@ -408,19 +443,28 @@ def get_cached_module_file( # Check we have all the requirements in our environment modules_needed = check_imports(resolved_module_file) + if is_local: + local_model_name = _sanitize_module_name(os.path.basename(os.path.normpath(pretrained_model_name_or_path))) + local_source_files_hash = _compute_local_source_files_hash( + pretrained_model_name_or_path, module_file, resolved_module_file, modules_needed + ) + if local_model_name: + submodule = os.path.sep.join([local_model_name, local_source_files_hash]) + else: + submodule = local_source_files_hash # Now we move the module inside our cached dynamic modules. full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule create_dynamic_module(full_submodule) submodule_path = Path(HF_MODULES_CACHE) / full_submodule - if submodule == _sanitize_module_name(os.path.basename(pretrained_model_name_or_path)): + if is_local: # We copy local files to avoid putting too many folders in sys.path. This copy is done when the file is new or # has changed since last copy. if not (submodule_path / module_file).exists() or not filecmp.cmp( resolved_module_file, str(submodule_path / module_file) ): (submodule_path / module_file).parent.mkdir(parents=True, exist_ok=True) - shutil.copy(resolved_module_file, submodule_path / module_file) + shutil.copyfile(resolved_module_file, submodule_path / module_file) importlib.invalidate_caches() for module_needed in modules_needed: module_needed = Path(module_file).parent / f"{module_needed}.py" @@ -428,7 +472,7 @@ def get_cached_module_file( if not (submodule_path / module_needed).exists() or not filecmp.cmp( module_needed_file, str(submodule_path / module_needed) ): - shutil.copy(module_needed_file, submodule_path / module_needed) + shutil.copyfile(module_needed_file, submodule_path / module_needed) importlib.invalidate_caches() else: # Get the commit hash @@ -442,7 +486,7 @@ def get_cached_module_file( create_dynamic_module(Path(full_submodule_module_file_path).parent) if not (submodule_path / module_file).exists(): - shutil.copy(resolved_module_file, submodule_path / module_file) + shutil.copyfile(resolved_module_file, submodule_path / module_file) importlib.invalidate_caches() # Make sure we also have every file with relative for module_needed in modules_needed: @@ -647,13 +691,13 @@ def _set_auto_map_in_config(_config): # Copy module file to the output folder. object_file = sys.modules[obj.__module__].__file__ dest_file = Path(folder) / (Path(object_file).name) - shutil.copy(object_file, dest_file) + shutil.copyfile(object_file, dest_file) result.append(dest_file) # Gather all relative imports recursively and make sure they are copied as well. 
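
A minimal sketch of the idea behind `_compute_local_source_files_hash` above, using a hypothetical local repo in a temporary directory: the cache key mixes each file's repo-relative path and raw bytes, so editing the custom module or any of its relative imports yields a new submodule directory.

```python
import hashlib
import tempfile
from pathlib import Path

def source_files_hash(model_dir: Path, files: list[Path]) -> str:
    """Stable 16-hex-char digest over (relative path, file bytes) pairs, sorted by path."""
    digest = hashlib.sha256()
    for relative_path, file_path in sorted((f.relative_to(model_dir).as_posix(), f) for f in files):
        digest.update(relative_path.encode("utf-8"))
        digest.update(file_path.read_bytes())
    return digest.hexdigest()[:16]

with tempfile.TemporaryDirectory() as tmp:
    model_dir = Path(tmp)
    modeling = model_dir / "modeling_custom.py"       # hypothetical custom-code files
    config = model_dir / "configuration_custom.py"
    modeling.write_text("print('v1')")
    config.write_text("HIDDEN_SIZE = 16")

    before = source_files_hash(model_dir, [modeling, config])
    modeling.write_text("print('v2')")                # editing any file changes the hash
    after = source_files_hash(model_dir, [modeling, config])
    assert before != after
```
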
for needed_file in get_relative_import_files(object_file): dest_file = Path(folder) / (Path(needed_file).name) - shutil.copy(needed_file, dest_file) + shutil.copyfile(needed_file, dest_file) result.append(dest_file) return result diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index f69b3fdfd9b0..e9840a1fd3a1 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -32,6 +32,8 @@ TensorType, _is_tensor_or_array_like, copy_func, + is_mlx_array, + is_mlx_available, is_numpy_array, is_torch_available, is_torch_device, @@ -142,6 +144,26 @@ def as_tensor(value): return torch.tensor(value) is_tensor = torch.is_tensor + + elif tensor_type == TensorType.MLX: + if not is_mlx_available(): + raise ImportError("Unable to convert output to MLX tensors format, MLX is not installed.") + import mlx.core as mx + + def as_tensor(value): + if isinstance(value, (list, tuple)) and len(value) > 0: + if isinstance(value[0], np.ndarray): + value = np.array(value) + elif ( + isinstance(value[0], (list, tuple)) + and len(value[0]) > 0 + and isinstance(value[0][0], np.ndarray) + ): + value = np.array(value) + return mx.array(value) + + is_tensor = is_mlx_array + else: def as_tensor(value, dtype=None): diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index 459dcfc1c2fa..6121b57909b8 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -943,7 +943,6 @@ def cancel_request(self, request_id: str) -> None: if self.batch_processor is not None: self.batch_processor.scheduler.set_request_cancellation(request_id) - # TODO:handle benchmarking properly when updating / fixing the requeue logic def get_result(self, request_id: str | None = None, timeout: float | None = None) -> GenerationOutput | None: """Retrieve one result from the output queue. 
@@ -956,14 +955,28 @@ def get_result(self, request_id: str | None = None, timeout: float | None = None """ if self._generation_thread is None and self.output_router.output_queue.empty(): return None + + deadline = None if timeout is None else perf_counter() + timeout + deferred: list[GenerationOutput] = [] + try: - result = self.output_router.output_queue.get(block=True, timeout=timeout) - if request_id is not None and result.request_id != request_id: - self.output_router.output_queue.put(result) - return None - return result - except queue.Empty: - return None + while True: + remaining = None if deadline is None else max(0.0, deadline - perf_counter()) + if remaining == 0.0: + return None + + try: + result = self.output_router.output_queue.get(timeout=remaining) + except queue.Empty: + return None + + if request_id is None or result.request_id == request_id: + return result + + deferred.append(result) + finally: + for item in deferred: + self.output_router.output_queue.put(item) def __iter__(self): """Iterate over results as they become available.""" @@ -980,11 +993,16 @@ def request_id_iter(self, request_id: str) -> Generator[GenerationOutput]: """ while self._generation_thread is not None and self._generation_thread.is_alive(): result = self.get_result(request_id=request_id, timeout=0.1) + if result is not None: yield result if result.is_finished(): return + if self.batch_processor is not None: + if self.batch_processor.scheduler.request_is_cancelled(request_id): + return + def register_result_handler(self, request_id: str, callback: Callable) -> None: """Register a callback for result delivery (streaming or non-streaming). diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 9c47e551cee8..3d0c70dd7413 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1005,7 +1005,14 @@ def __init__( @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isneginf(scores).all(dim=-1).any(): + raise ValueError( + "EtaLogitsWarper received a row with all logits set to -inf. " + "This usually means previous logits processors masked every token." + ) + probabilities = scores.softmax(dim=-1) + entropy = torch.distributions.Categorical(logits=scores).entropy() eta = torch.min(self.epsilon, torch.sqrt(self.epsilon) * torch.exp(-entropy))[..., None] indices_to_remove = probabilities < eta @@ -1661,13 +1668,22 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to class InfNanRemoveLogitsProcessor(LogitsProcessor): r""" - [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation method to fail. Note that using - the logits processor should only be used if necessary since it can slow down the generation method. + [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation method to fail. This version + has been extended to sanitize both logits and hidden state output tensors to handle instabilities in very wide + models or ones sharded across many devices. + + Note that using the logits processor should only be used if necessary since it can slow down the generation method. This logits processor has no `generate` example, as there shouldn't be a correct combination of flags that warrants - its use. + its use. 
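
A stripped-down sketch of the waiting pattern used by the new `get_result` above, with a plain `queue.Queue` standing in for the output router: results belonging to other requests are set aside while waiting, and put back in a `finally` block so they are never lost.

```python
import queue
from time import perf_counter

def get_matching(q: queue.Queue, request_id: str | None, timeout: float | None):
    deadline = None if timeout is None else perf_counter() + timeout
    deferred = []
    try:
        while True:
            remaining = None if deadline is None else max(0.0, deadline - perf_counter())
            if remaining == 0.0:
                return None
            try:
                result = q.get(timeout=remaining)
            except queue.Empty:
                return None
            if request_id is None or result["request_id"] == request_id:
                return result
            deferred.append(result)  # belongs to another request, set it aside
    finally:
        for item in deferred:        # requeue everything we skipped over
            q.put(item)

q = queue.Queue()
q.put({"request_id": "a", "text": "hello"})
q.put({"request_id": "b", "text": "world"})
print(get_matching(q, "b", timeout=0.5))  # {'request_id': 'b', 'text': 'world'}
print(q.qsize())                          # 1 -> the result for "a" was requeued
```
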
However, when dealing with sharded models across many GPUs or models with very wide hidden dimensions that + can produce unstable values, setting `remove_invalid_values=True` in generation config will activate this processor + automatically. """ + def __init__(self, hidden_states_aware=True): + # Flag to control whether we also want to clean hidden states + self.hidden_states_aware = hidden_states_aware + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: # set all nan values to 0.0 diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 388cef73566a..16aa5a7ff17e 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1086,9 +1086,31 @@ def _get_logits_processor( UserWarning, ) if generation_config.repetition_penalty is not None and generation_config.repetition_penalty != 1.0: - processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) + if self.config.is_encoder_decoder: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) + else: + inputs_embeds = model_kwargs.get("inputs_embeds") if model_kwargs is not None else None + if inputs_embeds is not None and (input_ids_seq_length is None or input_ids_seq_length == 0): + warnings.warn( + "Passing `repetition_penalty` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, + ) + else: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=generation_config.repetition_penalty)) if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0: - processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) + if self.config.is_encoder_decoder: + processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) + else: + inputs_embeds = model_kwargs.get("inputs_embeds") if model_kwargs is not None else None + if inputs_embeds is not None and (input_ids_seq_length is None or input_ids_seq_length == 0): + warnings.warn( + "Passing `no_repeat_ngram_size` requires some form of `input_ids` to be passed to " + "`generate`, ignoring the argument.", + UserWarning, + ) + else: + processors.append(NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size)) if ( generation_config.encoder_no_repeat_ngram_size is not None and generation_config.encoder_no_repeat_ngram_size > 0 @@ -1720,12 +1742,50 @@ def _prepare_generation_config( "parameters explicitly, but not both.", ) + # Safety: if the model is sharded across multiple devices (hf_device_map/device_map) and we are + # doing sampling, enable `remove_invalid_values` by default to avoid NaN/Inf logits causing CUDA + # asserts during multinomial sampling. Users can still override this by passing the flag explicitly. 
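
The reason the two processors above are skipped when only `inputs_embeds` are available: a repetition penalty has to look up the scores of previously generated token ids, which requires `input_ids`. A minimal sketch of that lookup (not the library implementation, just the gather/scatter idea):

```python
import torch

def apply_repetition_penalty(input_ids: torch.LongTensor, scores: torch.FloatTensor, penalty: float):
    # Gather the logits of tokens already present in the sequence...
    seen = torch.gather(scores, 1, input_ids)
    # ...and push them away from being re-sampled (divide positives, multiply negatives).
    seen = torch.where(seen < 0, seen * penalty, seen / penalty)
    return scores.scatter(1, input_ids, seen)

scores = torch.tensor([[2.0, -1.0, 0.5]])
input_ids = torch.tensor([[0, 2]])  # tokens 0 and 2 were already generated
print(apply_repetition_penalty(input_ids, scores, penalty=1.5))
# tensor([[ 1.3333, -1.0000,  0.3333]])
```
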
+ try: + is_sharded_map = False + hf_map = getattr(self, "hf_device_map", None) + if hf_map is not None and isinstance(hf_map, dict) and len(set(hf_map.values())) > 1: + # consider sharded if more than one device (excluding "cpu"/"disk") + devices = set(hf_map.values()) + gpu_devices = {d for d in devices if d not in {"cpu", "disk"}} + if len(gpu_devices) > 1: + is_sharded_map = True + + # also accept legacy `device_map` attribute or accelerate hooks + device_map_attr = getattr(self, "device_map", None) + if not is_sharded_map and device_map_attr is not None: + # device_map can be a dict mapping module->device or other structures; if it's a dict and maps + # to multiple cuda devices, consider it sharded + if isinstance(device_map_attr, dict) and len(set(device_map_attr.values())) > 1: + devices = set(device_map_attr.values()) + gpu_devices = {d for d in devices if d not in {"cpu", "disk"}} + if len(gpu_devices) > 1: + is_sharded_map = True + + if is_sharded_map and generation_config.do_sample and generation_config.remove_invalid_values is False: + generation_config.remove_invalid_values = True + logger.info( + "Enabling `remove_invalid_values=True` for sharded sampling to avoid NaN/Inf logits during sampling." + ) + except Exception as exception: + # never fail generation config preparation due to best-effort safety check + logger.debug("Failed to detect sharded generation setup: %s", exception) # Finally keep output_xxx args in `model_kwargs` so it can be passed to `forward` output_attentions = generation_config.output_attentions output_hidden_states = generation_config.output_hidden_states model_kwargs.update({"output_attentions": output_attentions} if output_attentions else {}) model_kwargs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + # Enforce deterministic greedy decoding if do_sample=False and num_beams = 1 + if generation_config.do_sample is False and generation_config.num_beams == 1: + generation_config.temperature = 1.0 + generation_config.top_k = 0 + generation_config.top_p = 1.0 + return generation_config, model_kwargs def _prepare_static_cache( @@ -1993,6 +2053,10 @@ def _tensor_or_none(token, device=None): generation_config._pad_token_tensor = pad_token_tensor generation_config._decoder_start_token_tensor = decoder_start_token_tensor + def _is_dynamo_compilation_disabled(self) -> bool: + """Check standard environment variables that explicitly disable torch.dynamo compilation.""" + return os.getenv("TORCHDYNAMO_DISABLE", "").lower() in {"1", "true", "yes", "on"} + def _valid_auto_compile_criteria( self: "GenerativePreTrainedModel", model_kwargs: dict[str, Any], generation_config: GenerationConfig ) -> bool: @@ -2003,6 +2067,9 @@ def _valid_auto_compile_criteria( if generation_config.disable_compile: return False + if self._is_dynamo_compilation_disabled(): + return False + cache = model_kwargs.get("past_key_values", model_kwargs.get("cache_params")) # Base logic @@ -2969,9 +3036,17 @@ def _get_top_k_continuations( # Gather the top K scores from _all_ beams. 
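
The device-map check added above boils down to "more than one non-CPU, non-disk device in the map". A standalone sketch of that predicate, evaluated on hypothetical device maps:

```python
def is_sharded(device_map: dict | None) -> bool:
    """True when the map places modules on more than one accelerator device."""
    if not isinstance(device_map, dict):
        return False
    gpu_devices = {d for d in device_map.values() if d not in {"cpu", "disk"}}
    return len(gpu_devices) > 1

print(is_sharded({"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1}))  # True
print(is_sharded({"": 0}))                                   # False -> single device
print(is_sharded({"model.layers.0": 0, "lm_head": "cpu"}))   # False -> one GPU + CPU offload
```
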
if do_sample: - topk_indices = torch.multinomial( - nn.functional.softmax(accumulated_log_probs, dim=-1), num_samples=beams_to_keep - ) + # Handle potential NaN values in accumulated_log_probs + probs = nn.functional.softmax(accumulated_log_probs, dim=-1) + # Replace NaN values with uniform distribution + if torch.isnan(probs).any(): + # Create a mask for NaN positions + nan_mask = torch.isnan(probs) + # Replace NaN with a small uniform probability + probs = torch.where(nan_mask, torch.ones_like(probs) / probs.shape[-1], probs) + # Renormalize to ensure probabilities sum to 1 + probs = probs / probs.sum(dim=-1, keepdim=True) + topk_indices = torch.multinomial(probs, num_samples=beams_to_keep) topk_log_probs = torch.gather(input=accumulated_log_probs, dim=1, index=topk_indices) else: topk_log_probs, topk_indices = torch.topk(accumulated_log_probs, k=beams_to_keep) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index b9e6f99b041d..32def64950b5 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -175,10 +175,19 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): " the argument parser only supports one type per argument." f" Problem encountered in field '{field.name}'." ) + # filter `dict` in Union because argparse does not support it + if dict in field.type.__args__: + remaining_types = tuple(arg for arg in field.type.__args__ if arg is not dict) + field.type = remaining_types[0] + for remaining_type in remaining_types[1:]: + field.type |= remaining_type if type(None) not in field.type.__args__: - # filter `str` in Union - field.type = field.type.__args__[0] if field.type.__args__[1] is str else field.type.__args__[1] - origin_type = getattr(field.type, "__origin__", field.type) + if len(field.type.__args__) > 2: + origin_type = str + else: + # filter `str` in Union + field.type = field.type.__args__[0] if field.type.__args__[1] is str else field.type.__args__[1] + origin_type = getattr(field.type, "__origin__", field.type) elif bool not in field.type.__args__: # filter `NoneType` in Union (except for `Union[bool, NoneType]`) field.type = ( diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 704001c476a6..74069f93aff6 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -675,6 +675,10 @@ def get_patch_output_size(image, target_resolution, input_data_format): original_height, original_width = get_image_size(image, channel_dim=input_data_format) target_height, target_width = target_resolution + if original_width == 0: + raise ValueError("original_width can not be 0") + if original_height == 0: + raise ValueError("original_height can not be 0") scale_w = target_width / original_width scale_h = target_height / original_height diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 88160d1bced3..9ea4bfed897e 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -15,7 +15,7 @@ from collections import defaultdict from collections.abc import Collection, Iterable from math import ceil -from typing import Optional, Union +from typing import Any, Optional, Union, overload import numpy as np @@ -26,7 +26,7 @@ get_image_size, infer_channel_dimension_format, ) -from .utils import ExplicitEnum, TensorType, is_torch_tensor +from .utils import ExplicitEnum, is_torch_tensor from .utils.import_utils import ( 
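
A self-contained demonstration of the NaN handling above: rows whose softmax produced NaNs are replaced by a uniform distribution and renormalized before `torch.multinomial` is called.

```python
import torch

probs = torch.tensor([
    [0.7, 0.2, 0.1],
    [float("nan"), float("nan"), float("nan")],  # e.g. softmax over an all -inf row
])

nan_mask = torch.isnan(probs)
probs = torch.where(nan_mask, torch.ones_like(probs) / probs.shape[-1], probs)
probs = probs / probs.sum(dim=-1, keepdim=True)  # renormalize so each row sums to 1

samples = torch.multinomial(probs, num_samples=2)
print(probs)    # second row is now uniform: [1/3, 1/3, 1/3]
print(samples)  # valid indices for both rows instead of an error / CUDA assert
```
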
is_torch_available, is_vision_available, @@ -547,7 +547,15 @@ def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: # 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py -def center_to_corners_format(bboxes_center: TensorType) -> TensorType: +@overload +def center_to_corners_format(bboxes_center: "torch.Tensor") -> "torch.Tensor": ... + + +@overload +def center_to_corners_format(bboxes_center: np.ndarray) -> np.ndarray: ... + + +def center_to_corners_format(bboxes_center: "torch.Tensor | np.ndarray") -> Any: """ Converts bounding boxes from center format to corners format. @@ -590,7 +598,15 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: return bboxes_center -def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: +@overload +def corners_to_center_format(bboxes_corners: "torch.Tensor") -> "torch.Tensor": ... + + +@overload +def corners_to_center_format(bboxes_corners: np.ndarray) -> np.ndarray: ... + + +def corners_to_center_format(bboxes_corners: "torch.Tensor | np.ndarray") -> Any: """ Converts bounding boxes from corners format to center format. diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 984d80964fad..8ed1ab73af1f 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,6 +17,7 @@ from collections.abc import Iterable from dataclasses import dataclass, fields from io import BytesIO +from pathlib import Path from typing import Any, Union import httpx @@ -463,14 +464,14 @@ def valid_coco_panoptic_annotations(annotations: Iterable[dict[str, list | tuple def load_image( - image: Union[str, "PIL.Image.Image"], + image: Union[str, Path, "PIL.Image.Image"], timeout: float | None = None, ) -> "PIL.Image.Image": """ Loads `image` to a PIL Image. Args: - image (`str` or `PIL.Image.Image`): + image (`str`, `Path` or `PIL.Image.Image`): The image to convert to the PIL Image format. timeout (`float`, *optional*): The timeout value in seconds for the URL request. @@ -479,6 +480,9 @@ def load_image( `PIL.Image.Image`: A PIL Image. """ requires_backends(load_image, ["vision"]) + if isinstance(image, Path): + image = str(image) + if isinstance(image, str): if image.startswith("http://") or image.startswith("https://"): # We need to actually check for a real protocol, otherwise it's impossible to use a local file diff --git a/src/transformers/initialization.py b/src/transformers/initialization.py index b0ebb053086b..28072ba3b022 100644 --- a/src/transformers/initialization.py +++ b/src/transformers/initialization.py @@ -15,6 +15,7 @@ import sys from collections import defaultdict from contextlib import contextmanager +from contextvars import ContextVar import torch @@ -38,6 +39,19 @@ "sparse_": torch.nn.init.sparse_, } +# Track the current no-tie scope per execution context so concurrent model loads +# do not leak tie_weights suppression across threads. +_SKIP_TIE_WEIGHTS_SCOPE: ContextVar[object | None] = ContextVar("_SKIP_TIE_WEIGHTS_SCOPE", default=None) + + +def should_skip_tie_weights(model) -> bool: + scope = _SKIP_TIE_WEIGHTS_SCOPE.get() + if scope is None: + return False + + # Only skip tying for the model instance created inside the active scope. 
+ return getattr(model, "_skip_tie_weights_scope", None) is scope + def uniform_( tensor: torch.Tensor, a: float = 0.0, b: float = 1.0, generator: torch.Generator | None = None @@ -287,19 +301,13 @@ def no_tie_weights(): weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's called in `post_init` when instantiating. """ - from .modeling_utils import PreTrainedModel - - def empty_func(*args, **kwargs): - pass - + # Use an opaque scope token so nested or concurrent loads can identify only + # the models instantiated under this context manager. + state_token = _SKIP_TIE_WEIGHTS_SCOPE.set(object()) try: - original_tie_weights = PreTrainedModel.tie_weights - PreTrainedModel.tie_weights = empty_func - yield finally: - # Set back the original - PreTrainedModel.tie_weights = original_tie_weights + _SKIP_TIE_WEIGHTS_SCOPE.reset(state_token) @contextmanager diff --git a/src/transformers/integrations/accelerate.py b/src/transformers/integrations/accelerate.py index c2b7fa603570..d7a1e4808f30 100644 --- a/src/transformers/integrations/accelerate.py +++ b/src/transformers/integrations/accelerate.py @@ -399,7 +399,12 @@ def accelerate_dispatch(model, hf_quantizer, device_map, offload_folder, offload ): device_map_kwargs["offload_buffers"] = True - if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled(): + is_quantized_bnb = ( + hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.BITS_AND_BYTES + ) + + if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled() and not is_quantized_bnb: dispatch_model(model, **device_map_kwargs) @@ -446,15 +451,13 @@ def accelerate_disk_offload( renamed) will be mapped to where they already reside on disk. Otherwise, the parameters will be resaved inside `disk_offload_folder` during loading. """ - from ..core_model_loading import WeightRenaming, rename_source_key + from ..core_model_loading import rename_source_key if disk_offload_folder is not None: os.makedirs(disk_offload_folder, exist_ok=True) is_offloaded_safetensors = checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors") - renamings = [] - if weight_mapping is not None: - renamings = [entry for entry in weight_mapping if isinstance(entry, WeightRenaming)] + transforms = weight_mapping if weight_mapping is not None else [] # In this case, the offload index is simply the existing safetensors (except if using custom weight loading # Operation, e.g. the MoE models, where we need to resave the weights that were changed at loading time) @@ -470,7 +473,7 @@ def accelerate_disk_offload( # Update the weight names according to the `weight_mapping` weight_renaming_map = { - rename_source_key(k, renamings, [], model.base_model_prefix, meta_state_dict)[0]: k for k in weight_map + rename_source_key(k, transforms, model.base_model_prefix, meta_state_dict)[0]: k for k in weight_map } # Prepare the index using existing safetensors files diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py index 9703f642f8bc..79f3896cb48c 100644 --- a/src/transformers/integrations/deepspeed.py +++ b/src/transformers/integrations/deepspeed.py @@ -347,7 +347,7 @@ def _apply_weight_conversions_to_state_dict(model, state_dict, weight_mapping): "in your DeepSpeed config or convert your checkpoint to the expected format first." 
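
A minimal sketch of the `ContextVar`-scoped suppression pattern used by `no_tie_weights` above, with a toy class instead of `PreTrainedModel`: only objects created inside the active scope observe the flag, so a concurrent load in another thread is unaffected.

```python
from contextlib import contextmanager
from contextvars import ContextVar

_SCOPE: ContextVar[object | None] = ContextVar("_SCOPE", default=None)

class ToyModel:
    def __init__(self):
        # Remember which scope (if any) was active when this instance was created.
        self._scope = _SCOPE.get()

    def tie_weights(self):
        if self._scope is not None and self._scope is _SCOPE.get():
            return "skipped"
        return "tied"

@contextmanager
def no_tie():
    token = _SCOPE.set(object())  # opaque token identifying this scope
    try:
        yield
    finally:
        _SCOPE.reset(token)

with no_tie():
    inside = ToyModel()
    print(inside.tie_weights())   # skipped
outside = ToyModel()
print(outside.tie_weights())      # tied
print(inside.tie_weights())       # tied again -> the scope has ended
```
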
) - from ..core_model_loading import WeightConverter, WeightRenaming, dot_natural_key, rename_source_key + from ..core_model_loading import WeightConverter, dot_natural_key, rename_source_key # Preserve metadata from the original state dict metadata = getattr(state_dict, "_metadata", None) @@ -360,14 +360,13 @@ def _apply_weight_conversions_to_state_dict(model, state_dict, weight_mapping): for key, param in model.state_dict().items(): model_state_dict[key] = torch.empty(param.shape, dtype=param.dtype, device="meta") - renamings = [entry for entry in weight_mapping if isinstance(entry, WeightRenaming)] converters = [entry for entry in weight_mapping if isinstance(entry, WeightConverter)] # Fast path: if we only have simple renamings and no converters, we can skip the expensive collection logic if len(converters) == 0: new_state_dict = {} for original_key, tensor in state_dict.items(): - renamed_key, _ = rename_source_key(original_key, renamings, [], prefix, model_state_dict) + renamed_key, _ = rename_source_key(original_key, weight_mapping, prefix, model_state_dict) if renamed_key in model_state_dict: new_state_dict[renamed_key] = tensor # Attach metadata to the new state dict @@ -386,7 +385,7 @@ def _apply_weight_conversions_to_state_dict(model, state_dict, weight_mapping): sorted_keys = sorted(state_dict.keys(), key=lambda k: dot_natural_key(k)) for original_key in sorted_keys: tensor = state_dict.pop(original_key) - renamed_key, source_pattern = rename_source_key(original_key, renamings, converters, prefix, model_state_dict) + renamed_key, source_pattern = rename_source_key(original_key, weight_mapping, prefix, model_state_dict) # Only process if the renamed key is in the model's state dict if renamed_key in model_state_dict: diff --git a/src/transformers/integrations/dsa_kernels.py b/src/transformers/integrations/dsa_kernels.py new file mode 100644 index 000000000000..aa7498a387be --- /dev/null +++ b/src/transformers/integrations/dsa_kernels.py @@ -0,0 +1,479 @@ +import torch + +from ..utils import logging as transformers_logging + + +logger = transformers_logging.get_logger(__name__) + +# Try to import tilelang for accelerated kernels +_tilelang_available = False +try: + import tilelang + import tilelang.language as T + + tilelang.set_log_level("WARNING") + + pass_configs = { + tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, + tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, + tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, + } + _tilelang_available = True +except Exception: + T = None + +FP8 = "float8_e4m3" +BF16 = "bfloat16" +FP32 = "float32" + + +# ---- TileLang kernel definitions (only if tilelang is available) ---- +if _tilelang_available: + + def fast_log2_ceil(x): + bits_x = T.reinterpret("uint32", x) + exp_x = (bits_x >> 23) & 0xFF + man_bits = bits_x & ((1 << 23) - 1) + return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + + def fast_pow2(x): + bits_x = (x + 127) << 23 + return T.reinterpret("float32", bits_x) + + def fast_round_scale(amax, fp8_max_inv): + return fast_pow2(fast_log2_ceil(amax * fp8_max_inv)) + + @tilelang.jit(pass_configs=pass_configs) + def act_quant_kernel(N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False): + M = T.symbolic("M") + fp8_min = -448.0 + fp8_max = 448.0 + fp8_max_inv = 1 / fp8_max + num_stages = 0 if round_scale else 2 + blk_m = 32 + group_size = 128 + + @T.prim_func + def act_quant_kernel_( + X: T.Tensor[(M, N), in_dtype], + Y: T.Tensor[(M, N), out_dtype], + S: T.Tensor[(M, T.ceildiv(N, 
group_size)), scale_dtype], + ): + with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as ( + pid_m, + pid_n, + ): + x_shared = T.alloc_shared((blk_m, group_size), in_dtype) + x_local = T.alloc_fragment((blk_m, group_size), in_dtype) + amax_local = T.alloc_fragment((blk_m,), scale_dtype) + s_local = T.alloc_fragment((blk_m,), scale_dtype) + y_local = T.alloc_fragment((blk_m, group_size), out_dtype) + y_shared = T.alloc_shared((blk_m, group_size), out_dtype) + + for _ in T.Pipelined(1, num_stages=num_stages): + T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared) + T.copy(x_shared, x_local) + T.reduce_absmax(x_local, amax_local, dim=1) + for i in T.Parallel(blk_m): + amax_local[i] = T.max(amax_local[i], 1e-4) + if round_scale: + s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv) + else: + s_local[i] = amax_local[i] * fp8_max_inv + for i, j in T.Parallel(blk_m, group_size): + y_local[i, j] = T.clamp(x_local[i, j] / s_local[i], fp8_min, fp8_max) + for i in T.Parallel(blk_m): + S[pid_m * blk_m + i, pid_n] = s_local[i] + T.copy(y_local, y_shared) + T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size]) + + return act_quant_kernel_ + + @tilelang.jit(pass_configs=pass_configs) + def fp8_gemm_kernel(N, K, out_dtype=BF16, accum_dtype="float32"): + assert out_dtype in [BF16, "float32"] + + M = T.symbolic("M") + group_size = 128 + block_M = 32 + block_N = 128 + block_K = 128 + + @T.prim_func + def fp8_gemm_kernel_( + A: T.Tensor[(M, K), FP8], + B: T.Tensor[(N, K), FP8], + C: T.Tensor[(M, N), out_dtype], + scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32], + scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32], + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as ( + bx, + by, + ): + A_shared = T.alloc_shared((block_M, block_K), FP8) + B_shared = T.alloc_shared((block_N, block_K), FP8) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + Scale_C_shared = T.alloc_shared((block_M), FP32) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_accum = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Improve L2 Cache + T.use_swizzle(panel_size=10) + + T.clear(C_local) + T.clear(C_local_accum) + K_iters = T.ceildiv(K, block_K) + for k in T.Pipelined(K_iters, num_stages=4): + # Load A into shared memory + T.copy(A[by * block_M, k * block_K], A_shared) + # Load B into shared memory + T.copy(B[bx * block_N, k * block_K], B_shared) + # Load scale into shared memory + Scale_B = scales_b[bx * block_N // group_size, k] + for i in T.Parallel(block_M): + Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B + + T.gemm(A_shared, B_shared, C_local, transpose_B=True) + # Promote to enable 2xAcc + for i, j in T.Parallel(block_M, block_N): + C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i] + T.clear(C_local) + # TMA store + T.copy(C_local_accum, C_shared) + T.copy(C_shared, C[by * block_M, bx * block_N]) + + return fp8_gemm_kernel_ + + @tilelang.jit(out_idx=[4], pass_configs=pass_configs) + def fp8_index_kernel(h: int, d: int): + b = T.symbolic("b") + m = T.symbolic("m") + n = T.symbolic("n") + + blk_n1 = 512 + blk_n2 = 128 + + @T.prim_func + def fp8_index_kernel_( + q: T.Tensor[(b, m, h, d), FP8], + q_s: T.Tensor[(b, m, h), FP32], + k: T.Tensor[(b, n, d), FP8], + k_s: T.Tensor[(b, n), FP32], + o: T.Tensor[(b, m, n), FP32], + ) -> None: + with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n): + q_smem = T.alloc_shared((h, d), FP8) + T.copy(q[i_b, i_m, 0, 0], q_smem) + + 
q_s_frag = T.alloc_fragment(h, FP32) + T.copy(q_s[i_b, i_m, 0], q_s_frag) + + for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2): + k_smem = T.alloc_shared((blk_n2, d), FP8) + T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem) + + k_s_frag = T.alloc_fragment(blk_n2, FP32) + T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag) + + logits = T.alloc_fragment((blk_n2, h), FP32) + T.gemm( + k_smem, + q_smem, + logits, + transpose_A=False, + transpose_B=True, + clear_accum=True, + ) + + for i_h, i3_n in T.Parallel(h, blk_n2): + logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h] + + logits_sum = T.alloc_fragment(blk_n2, FP32) + T.reduce_sum(logits, logits_sum, dim=1) + + for i3_n in T.Parallel(blk_n2): + logits_sum[i3_n] *= k_s_frag[i3_n] + + T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2]) + + return fp8_index_kernel_ + + +# ---- PyTorch fallback implementations ---- + + +def _act_quant_pytorch( + x: torch.Tensor, block_size: int = 128, scale_fmt: str | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + """Pure PyTorch implementation of block-wise FP8 activation quantization. + + Equivalent to the TileLang ``act_quant_kernel``: per-group absmax scaling, + optional power-of-2 rounded scales, clamp to FP8 range. + """ + N = x.size(-1) + assert N % block_size == 0, f"Last dimension size must be divisible by block_size (block_size={block_size})" + num_groups = N // block_size + orig_shape = x.shape + + # Flatten to 2D, then group — mirrors the TileLang kernel's (M, N) layout. + x_flat = x.reshape(-1, N) # [M, N] + x_grouped = x_flat.reshape(-1, num_groups, block_size) # [M, G, BS] + + # Per-group absmax + amax = x_grouped.abs().amax(dim=-1).clamp(min=1e-4) # [M, G] + + if scale_fmt is not None: + # Power-of-2 rounded scale: scale = 2^(ceil(log2(amax / 448))) + scale = torch.pow(2.0, torch.ceil(torch.log2(amax / 448.0))) + else: + scale = amax / 448.0 + + # Quantize: divide each group by its scale, clamp to FP8 range + x_q = (x_grouped / scale.unsqueeze(-1)).clamp(-448.0, 448.0).to(torch.float8_e4m3fn) # [M, G, BS] + x_q = x_q.reshape(orig_shape) + + # Scale shape: (*x.shape[:-1], num_groups) + scale = scale.reshape(*orig_shape[:-1], num_groups) + return x_q, scale + + +def _fp8_index_pytorch( + q: torch.Tensor, + q_s: torch.Tensor, + k: torch.Tensor, + k_s: torch.Tensor, +) -> torch.Tensor: + """Pure PyTorch implementation of FP8 index scoring. + + Equivalent to the TileLang ``fp8_index_kernel``: + logits = k @ q^T (FP8 -> FP32 matmul over D) + logits = relu(logits) * q_s (per-head scale) + result = logits.sum(H) * k_s (reduce heads, scale by k) + """ + q_bf16 = q.to(torch.bfloat16) + k_bf16 = k.to(torch.bfloat16) + # q: [B, M, H, D], k: [B, T, D] -> logits: [B, M, T, H] + # Matches TileLang kernel: logits[n, h] = k[n, :] @ q[h, :]^T + logits = torch.einsum("bmhd,btd->bmth", q_bf16, k_bf16) + logits = logits.clamp(min=0) * q_s.unsqueeze(-2) # q_s: [B,M,H] -> [B,M,1,H] + result = logits.sum(dim=-1) * k_s.unsqueeze(-2) # k_s: [B,T] -> [B,1,T] + return result + + +def _fp8_index_triton( + q: torch.Tensor, + q_s: torch.Tensor, + k: torch.Tensor, + k_s: torch.Tensor, +) -> torch.Tensor: + """Triton FP8 GEMM implementation of FP8 index scoring. + + Uses ``w8a8_fp8_matmul`` from the finegrained-fp8 integration (which dispatches + to Triton on Blackwell) for FP8→FP32 matmul, matching vLLM's computation + granularity. Post-processing (relu, scale, reduce) is done in FP32. 
+ + Equivalent to the TileLang ``fp8_index_kernel``: + logits = dequant(q_fp8, q_scale) @ dequant(k_fp8, k_scale)^T (FP8 dequant + FP32 matmul) + logits = relu(logits) * q_s (per-head weights, already includes q_scale) + result = logits.sum(H) * k_s (reduce heads, scale by k_scale) + """ + from .finegrained_fp8 import w8a8_fp8_matmul + + B, M, H, D = q.shape + T = k.shape[1] + + if B == 1: + # Single batch: one matmul for all (M, H) query vectors against all T keys + q_flat = q.reshape(M * H, D).contiguous() + k_flat = k.reshape(T, D).contiguous() + # Create unit scales: fp8_gemm will compute raw FP8 dot products + # (dequant with scale=1 is equivalent to using FP8 values directly) + ones_q = q_flat.new_ones(M * H, D // 128, dtype=torch.float32) + ones_k = k_flat.new_ones(T, D // 128, dtype=torch.float32) + logits_flat = w8a8_fp8_matmul(q_flat, k_flat, ones_q, ones_k, [128, 128], torch.float32) + # logits_flat: [M*H, T] → reshape to [M, H, T] → transpose to [M, T, H] + logits = logits_flat.reshape(M, H, T).permute(0, 2, 1).unsqueeze(0) # [1, M, T, H] + else: + # Multi-batch: loop over batches + results = [] + for b in range(B): + q_b = q[b].reshape(M * H, D).contiguous() + k_b = k[b].reshape(T, D).contiguous() + ones_q_b = q_b.new_ones(M * H, D // 128, dtype=torch.float32) + ones_k_b = k_b.new_ones(T, D // 128, dtype=torch.float32) + logits_b = w8a8_fp8_matmul(q_b, k_b, ones_q_b, ones_k_b, [128, 128], torch.float32) + logits_b = logits_b.reshape(M, H, T).permute(0, 2, 1) # [M, T, H] + results.append(logits_b) + logits = torch.stack(results, dim=0) # [B, M, T, H] + + # Post-processing in FP32 — matches TileLang kernel + logits = logits.clamp(min=0) * q_s.unsqueeze(-2) # relu * weights + result = logits.sum(dim=-1) * k_s.unsqueeze(-2) # reduce heads * k_scale + return result + + +# ---- Public API: TileLang → Triton → PyTorch fallback ---- + +# One-time flags — once a backend fails, we stop retrying it. +_act_quant_use_tilelang = _tilelang_available +_fp8_index_use_tilelang = _tilelang_available +_fp8_gemm_use_tilelang = _tilelang_available + +# Lazily-loaded Triton kernels from the finegrained-fp8 hub package. +_triton_act_quant = None +_triton_fallbacks_loaded = False + + +def _load_triton_fallbacks(): + """Lazily load Triton FP8 kernels from the finegrained-fp8 hub package.""" + global _triton_fallbacks_loaded, _triton_act_quant + if _triton_fallbacks_loaded: + return + _triton_fallbacks_loaded = True + try: + from .finegrained_fp8 import triton_fp8_act_quant + + _triton_act_quant = triton_fp8_act_quant + except ImportError: + pass + + +def act_quant( + x: torch.Tensor, block_size: int = 128, scale_fmt: str | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Quantizes the input tensor `x` using block-wise quantization. + + Fallback chain: TileLang → Triton (non-ue8m0 only) → PyTorch. + + Args: + x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last + dimension size must be divisible by `block_size`. + block_size (int, optional): The size of the blocks to be used for quantization. + Default is 128. + scale_fmt (Optional[str], optional): The format of the scale. Default is None. + When set (e.g. ``"ue8m0"``), scales are rounded to powers of 2 — handled by + the PyTorch fallback since the Triton kernel does not support power-of-2 rounding. + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing: + - The quantized tensor with dtype `torch.float8_e4m3fn`. + - A tensor of scaling factors with dtype `torch.float32`. 
+ """ + assert x.is_contiguous(), "Input tensor must be contiguous" + assert x.size(-1) % block_size == 0, ( + f"Last dimension size must be divisible by block_size (block_size={block_size})" + ) + + global _act_quant_use_tilelang + if _act_quant_use_tilelang: + try: + N = x.size(-1) + y = torch.empty_like(x, dtype=torch.float8_e4m3fn) + s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32) + kernel = act_quant_kernel(N, round_scale=scale_fmt is not None) + kernel(x.view(-1, N), y.view(-1, N), s.view(-1, N // block_size)) + return y, s + except Exception: + logger.warning_once("TileLang act_quant compilation failed, falling back to PyTorch implementation") + _act_quant_use_tilelang = False + + # Triton fallback — only for non-ue8m0 scales (Triton kernel lacks power-of-2 rounding) + if scale_fmt is None: + global _triton_act_quant + _load_triton_fallbacks() + if _triton_act_quant is not None: + try: + N = x.size(-1) + x_flat = x.reshape(-1, N).contiguous() + x_q_flat, scale_flat = _triton_act_quant(x_flat, block_size) + x_q = x_q_flat.reshape(x.shape) + scale = scale_flat.reshape(*x.shape[:-1], N // block_size) + return x_q, scale + except Exception: + logger.warning_once("Triton act_quant failed, falling back to PyTorch") + _triton_act_quant = None + + return _act_quant_pytorch(x, block_size, scale_fmt) + + +def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor) -> torch.Tensor: + """ + Perform a matrix multiplication using FP8 precision. + + Args: + a (torch.Tensor): The first input matrix, must be contiguous. + a_s (torch.Tensor): The scaling factor for the first input matrix, must be contiguous. + b (torch.Tensor): The second input matrix, must be contiguous. + b_s (torch.Tensor): The scaling factor for the second input matrix, must be contiguous. + + Returns: + torch.Tensor: The result of the matrix multiplication. + """ + assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous" + assert a_s.is_contiguous() and b_s.is_contiguous(), "Scaling factor tensors must be contiguous" + + global _fp8_gemm_use_tilelang + if _fp8_gemm_use_tilelang: + try: + K = a.size(-1) + M = a.numel() // K + N = b.size(0) + c = a.new_empty(*a.size()[:-1], N, dtype=torch.get_default_dtype()) + kernel = fp8_gemm_kernel(N, K) + kernel(a.view(M, K), b, c.view(M, N), a_s.view(M, -1), b_s) + return c + except Exception: + logger.warning_once("TileLang fp8_gemm compilation failed, falling back to PyTorch implementation") + _fp8_gemm_use_tilelang = False + + # PyTorch fallback: dequantize and matmul + group_size = a.shape[-1] // a_s.shape[-1] + a_deq = a.to(torch.bfloat16) * a_s.to(torch.bfloat16).repeat_interleave(group_size, dim=-1) + b_deq = b.to(torch.bfloat16) * b_s.to(torch.bfloat16).repeat_interleave(group_size, dim=-1).repeat_interleave( + group_size, dim=0 + ) + return torch.matmul(a_deq, b_deq.T) + + +def fp8_index( + q: torch.Tensor, + q_s: torch.Tensor, + k: torch.Tensor, + k_s: torch.Tensor, +) -> torch.Tensor: + """ + Perform index score using FP8 precision. + + Fallback chain: TileLang → Triton fp8_gemm → PyTorch bf16 einsum. + + The Triton path uses the fp8_gemm kernel from the finegrained-fp8 hub package + to compute raw FP8 dot products with FP32 accumulation, matching vLLM's + DeepGEMM fp8_mqa_logits computation granularity. + + Args: + q (torch.Tensor): The Q tensor, must be contiguous. + q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous. + k (torch.Tensor): The K tensor, must be contiguous. 
+ k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous. + + fp8 q @ fp8 k -> fp32 logits + relu(fp32 logits) * q_s (weights) -> fp32 logits + fp32 logits -> fp32 logits_sum + fp32 logits_sum * k_s (e8m0) -> fp32 index_score + """ + global _fp8_index_use_tilelang + if _fp8_index_use_tilelang: + try: + return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s) + except Exception: + logger.warning_once("TileLang fp8_index compilation failed, falling back to PyTorch implementation") + _fp8_index_use_tilelang = False + + # Triton fallback: FP8 matmul with FP32 accumulation (matches vLLM granularity) + try: + return _fp8_index_triton(q, q_s, k, k_s) + except Exception: + logger.warning_once("Triton fp8_index failed, falling back to PyTorch bf16 implementation") + + return _fp8_index_pytorch(q, q_s, k, k_s) diff --git a/src/transformers/integrations/executorch.py b/src/transformers/integrations/executorch.py index 675a0ea5783a..a835fc44cc71 100644 --- a/src/transformers/integrations/executorch.py +++ b/src/transformers/integrations/executorch.py @@ -13,8 +13,10 @@ import logging import torch +import torch.utils._pytree as pytree from ..cache_utils import ( + Cache, DynamicCache, DynamicLayer, DynamicSlidingWindowLayer, @@ -25,10 +27,7 @@ ) from ..generation.configuration_utils import GenerationConfig from ..modeling_utils import PreTrainedModel -from ..pytorch_utils import ( - is_torch_greater_or_equal, - is_torch_greater_or_equal_than_2_6, -) +from ..pytorch_utils import is_torch_greater_or_equal class TorchExportableModuleForVLM: @@ -881,7 +880,7 @@ def __init__(self, model, max_static_cache_length, batch_size): self.static_cache.early_initialization(batch_size, num_heads, head_dim, torch.float32, model_device) self.cache = EncoderDecoderCache(self.static_cache, DynamicCache(config=self.config)) - register_dynamic_cache_export_support() + register_pytree_cache() # Register cache buffers to make them exportable for i, layer in enumerate(self.static_cache.layers): @@ -889,7 +888,13 @@ def __init__(self, model, max_static_cache_length, batch_size): self.register_buffer(f"value_cache_{i}", layer.values, persistent=False) self.register_buffer(f"cumulative_length_{i}", layer.cumulative_length, persistent=False) - def forward(self, decoder_input_ids, encoder_hidden_states, cache_position): + def forward( + self, + decoder_input_ids: torch.Tensor, + encoder_hidden_states: torch.Tensor, + cache_position: torch.Tensor, + encoder_attention_mask: torch.Tensor | None = None, + ): # Start by resetting static cache (it's needed to be able to run several generations with the same exported program, # as otherwise it's mutated in-place indefinitely - we cannot call reset in-between the `generate` as the program was # already exported) @@ -900,6 +905,7 @@ def forward(self, decoder_input_ids, encoder_hidden_states, cache_position): outputs = self.decoder( input_ids=decoder_input_ids, encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, past_key_values=self.cache, use_cache=True, ) @@ -947,7 +953,7 @@ def _export_encoder(self, encoder_input_ids): return exported_encoder - def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_position): + def _export_decoder(self, decoder_input_ids, encoder_hidden_states, cache_position, encoder_attention_mask=None): target_device = self.full_model.device wrapped_decoder = ( Seq2SeqLMDecoderExportableModuleWithStaticCache( @@ -963,27 +969,35 @@ def _export_decoder(self, decoder_input_ids, 
encoder_hidden_states, cache_positi decoder_input_ids = decoder_input_ids.to(target_device) encoder_hidden_states = encoder_hidden_states.to(target_device) cache_position = cache_position.to(target_device) - - # Define dynamic dimension for encoder output sequence length - encoder_seq_len_dim = torch.export.Dim("encoder_hidden_seq_length", max=self.max_hidden_seq_length) - - # Export the decoder + if encoder_attention_mask is not None: + encoder_attention_mask = encoder_attention_mask.to(target_device) + + # Export the decoder. + # encoder_hidden_states uses a static shape to avoid a symbolic-shape + # conflict with the static KV cache size during torch.export. Callers + # that pad encoder inputs to a fixed max length (e.g. max_hidden_seq_length) + # should pass encoder_hidden_states of that shape. with torch.no_grad(): exported_decoder = torch.export.export( wrapped_decoder, - (decoder_input_ids, encoder_hidden_states, cache_position), - dynamic_shapes={ - "decoder_input_ids": None, - "encoder_hidden_states": {1: encoder_seq_len_dim}, - "cache_position": None, - }, + (decoder_input_ids, encoder_hidden_states, cache_position, encoder_attention_mask), + dynamic_shapes=None, strict=True, ) return exported_decoder - def export(self, encoder_input_ids=None, decoder_input_ids=None, encoder_hidden_states=None, cache_position=None): + def export( + self, + encoder_input_ids=None, + decoder_input_ids=None, + encoder_hidden_states=None, + cache_position=None, + encoder_attention_mask=None, + ): device = self.full_model.device + max_cache_len = self.generation_config.cache_config.get("max_cache_len") + batch_size = self.generation_config.cache_config.get("batch_size") example_encoder_input_ids = ( encoder_input_ids if encoder_input_ids is not None @@ -1001,14 +1015,22 @@ def export(self, encoder_input_ids=None, decoder_input_ids=None, encoder_hidden_ encoder_hidden_states if encoder_hidden_states is not None else torch.zeros( - (self.generation_config.cache_config.get("batch_size"), 10, self.config.d_model), + (batch_size, max_cache_len, self.config.d_model), dtype=torch.float32, device=device, ) ) + example_encoder_attention_mask = ( + encoder_attention_mask + if encoder_attention_mask is not None + else torch.ones((batch_size, max_cache_len), dtype=torch.long, device=device) + ) self.exported_encoder = self._export_encoder(example_encoder_input_ids) self.exported_decoder = self._export_decoder( - example_decoder_input_ids, example_encoder_hidden_states, example_cache_position + example_decoder_input_ids, + example_encoder_hidden_states, + example_cache_position, + example_encoder_attention_mask, ) # Return self to allow chaining @@ -1025,6 +1047,22 @@ def generate(self, prompt_token_ids, max_new_tokens): # Run encoder encoder_output = self.exported_encoder.module()(prompt_token_ids) + # Build encoder attention mask: 1 at real token positions, 0 at padding. + # Assumes padding token id is 0 (standard for T5 and most seq2seq models). 
+ max_cache_len = self.generation_config.cache_config.get("max_cache_len") + batch_size = prompt_token_ids.shape[0] + encoder_attention_mask = (prompt_token_ids != 0).long() + # Pad or trim to max_cache_len so shape matches the static export + if encoder_attention_mask.shape[1] < max_cache_len: + pad = torch.zeros( + (batch_size, max_cache_len - encoder_attention_mask.shape[1]), + dtype=torch.long, + device=model_device, + ) + encoder_attention_mask = torch.cat([encoder_attention_mask, pad], dim=1) + else: + encoder_attention_mask = encoder_attention_mask[:, :max_cache_len] + # Initialize with start token (0 for T5) on the correct device decoder_input_ids = torch.tensor([[0]], dtype=torch.long, device=model_device) generated_ids = [0] @@ -1033,7 +1071,10 @@ def generate(self, prompt_token_ids, max_new_tokens): for i in range(max_new_tokens - 1): # Run decoder for next token prediction logits = self.exported_decoder.module()( - decoder_input_ids, encoder_output, torch.tensor([i], dtype=torch.long, device=model_device) + decoder_input_ids, + encoder_output, + torch.tensor([i], dtype=torch.long, device=model_device), + encoder_attention_mask, ) # Get next token @@ -1067,7 +1108,7 @@ def export_with_dynamic_cache( Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`. """ - register_dynamic_cache_export_support() + register_pytree_cache() with torch.no_grad(): exported_program = torch.export.export( @@ -1084,54 +1125,97 @@ def export_with_dynamic_cache( return exported_program -def register_dynamic_cache_export_support(): - """ - Utilities for `DynamicCache` <> torch.export support - """ - +def _register_pytree_node(cls, flatten_fn, unflatten_fn, flatten_with_keys_fn): try: - torch.utils._pytree.register_pytree_node( - DynamicCache, - lambda dynamic_cache: torch.utils._pytree._dict_flatten(_get_cache_dict(dynamic_cache)), - _unflatten_dynamic_cache, - serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}", - flatten_with_keys_fn=lambda dynamic_cache: torch.utils._pytree._dict_flatten_with_keys( - _get_cache_dict(dynamic_cache) - ), + pytree.register_pytree_node( + cls, + flatten_fn, + unflatten_fn, + serialized_type_name=f"{cls.__module__}.{cls.__name__}", + flatten_with_keys_fn=flatten_with_keys_fn, ) - # TODO (tmanlaibaatar) This won't be needed in torch 2.7. 
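
Both the `DynamicCache`-only registration being removed here and the generalized helpers that replace it rely on the same underlying pytree hook, which is what lets `torch.export` flatten a cache into plain tensors and rebuild it afterwards. A toy illustration of that hook, independent of the real cache classes (the `KVLayer` container is made up for the example):

```python
import torch
import torch.utils._pytree as pytree

class KVLayer:  # toy stand-in for a cache layer, not a transformers class
    def __init__(self, keys=None, values=None):
        self.keys, self.values = keys, values

# Teach pytree how to take the container apart (children + context) and rebuild it.
pytree.register_pytree_node(
    KVLayer,
    lambda layer: ([layer.keys, layer.values], ["keys", "values"]),
    lambda children, context: KVLayer(*children),
    serialized_type_name="example.KVLayer",
)

leaves, spec = pytree.tree_flatten(KVLayer(torch.zeros(2), torch.ones(2)))
rebuilt = pytree.tree_unflatten(leaves, spec)  # a KVLayer again, holding the same tensors
```
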
- torch.fx._pytree.register_pytree_flatten_spec( - DynamicCache, - lambda cache, spec: torch.fx._pytree._dict_flatten_spec(_get_cache_dict(cache), spec), - ) - # Catching this in case there are multiple runs for some test runs - except ValueError as e: - if "already registered as pytree node" not in str(e): + except ValueError as error: + if "already registered as pytree node" not in str(error): raise -def _get_cache_dict(cache: DynamicCache): - """Convert cache to dictionary format for pytree operations.""" - if any(not isinstance(layer, (DynamicLayer, DynamicSlidingWindowLayer)) for layer in cache.layers): - raise RuntimeError("This pytree flattening function should only be applied to DynamicCache") +def _register_pytree_cache_layer(cache_layer_cls): + def _flatten_layer(layer): + attributes = { + "keys": layer.keys, + "values": layer.values, + "is_initialized": layer.is_initialized, + } + for name in ( + "max_cache_len", + "max_batch_size", + "num_heads", + "k_head_dim", + "v_head_dim", + "cumulative_length", + "cumulative_length_int", + "sliding_window", + ): + if hasattr(layer, name): + attributes[name] = getattr(layer, name) + return list(attributes.values()), list(attributes.keys()) + + def _unflatten_layer(values, context): + attributes = dict(zip(context, values)) + + if cache_layer_cls is StaticLayer: + layer = cache_layer_cls(max_cache_len=attributes["max_cache_len"]) + elif cache_layer_cls is StaticSlidingWindowLayer: + layer = cache_layer_cls( + max_cache_len=attributes["max_cache_len"], + sliding_window=attributes["max_cache_len"], + ) + elif cache_layer_cls is DynamicSlidingWindowLayer: + layer = cache_layer_cls(sliding_window=attributes["sliding_window"]) + else: + layer = cache_layer_cls() + + for name, value in attributes.items(): + setattr(layer, name, value) + return layer + + def _flatten_layer_with_keys(layer): + values, context = _flatten_layer(layer) + return [(pytree.MappingKey(key), value) for key, value in zip(context, values)], context + + _register_pytree_node(cache_layer_cls, _flatten_layer, _unflatten_layer, _flatten_layer_with_keys) + + +def _register_pytree_cache(cache_cls): + def _flatten_cache(cache): + attributes = { + "layers": cache.layers, + "offloading": cache.offloading, + "only_non_sliding": getattr(cache, "only_non_sliding", True), + } + return list(attributes.values()), list(attributes.keys()) + + def _flatten_cache_with_keys(cache): + values, context = _flatten_cache(cache) + return [(pytree.MappingKey(key), value) for key, value in zip(context, values)], context + + def _unflatten_cache(values, context): + attributes = dict(zip(context, values)) + cache = Cache( + layers=attributes["layers"], + offloading=attributes["offloading"], + offload_only_non_sliding=attributes["only_non_sliding"], + ) + cache.__class__ = cache_cls + return cache - if not is_torch_greater_or_equal_than_2_6: - logging.warning("DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.") + _register_pytree_node(cache_cls, _flatten_cache, _unflatten_cache, _flatten_cache_with_keys) - return { - "key_cache": [layer.keys for layer in cache.layers if layer.keys is not None], - "value_cache": [layer.values for layer in cache.layers if layer.values is not None], - } +def register_pytree_cache(): + """Register cache classes as pytrees for torch.export.""" + for cache_layer_cls in (StaticLayer, StaticSlidingWindowLayer, DynamicLayer, DynamicSlidingWindowLayer): + _register_pytree_cache_layer(cache_layer_cls) -def _unflatten_dynamic_cache(values, 
context: torch.utils._pytree.Context): - dictionary = torch.utils._pytree._dict_unflatten(values, context) - cache = DynamicCache() - # Reconstruct layers from keys and values lists - key_list = dictionary.get("key_cache", []) - value_list = dictionary.get("value_cache", []) - for idx in range(max(len(key_list), len(value_list))): - key = key_list[idx] if idx < len(key_list) else None - value = value_list[idx] if idx < len(value_list) else None - cache.update(key, value, idx) - return cache + for cache_cls in (StaticCache, DynamicCache): + _register_pytree_cache(cache_cls) diff --git a/src/transformers/integrations/finegrained_fp8.py b/src/transformers/integrations/finegrained_fp8.py index c64f1ce23ec2..02577b4e85b1 100644 --- a/src/transformers/integrations/finegrained_fp8.py +++ b/src/transformers/integrations/finegrained_fp8.py @@ -128,6 +128,10 @@ def _load_deepgemm_kernel(): # DeepGEMM requires Hopper (SM90) or newer for FP8 WGMMA instructions major = torch.cuda.get_device_capability()[0] + if major >= 10: + raise ImportError( + "DeepGEMM is not yet supported on Blackwell (SM100+) GPUs. Falling back to Triton finegrained-fp8 kernel." + ) if major < 9: raise ImportError( f"DeepGEMM requires a Hopper (SM90+) or newer GPU, but the current device " diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index c9ba021c54db..8bf7bfdba648 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -259,6 +259,7 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_rms_epsilon": "rms_norm_eps", "attention.sliding_window": "sliding_window", + "attention.logit_softcapping": "attn_logit_softcapping", "vocab_size": "vocab_size", }, "gemma3": { @@ -275,6 +276,7 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_rms_epsilon": "rms_norm_eps", "attention.sliding_window": "sliding_window", + "attention.logit_softcapping": "attn_logit_softcapping", "vocab_size": "vocab_size", }, "umt5": { diff --git a/src/transformers/integrations/hqq.py b/src/transformers/integrations/hqq.py index 083ec53a2fd3..f83007410f7d 100755 --- a/src/transformers/integrations/hqq.py +++ b/src/transformers/integrations/hqq.py @@ -127,3 +127,135 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve logger.warning("No linear modules were found in your model for quantization.") return model + + +class HqqQuantize: + """HQQ quantization operation for the new weight loading flow.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict, + full_layer_name=None, + model=None, + **kwargs, + ): + from hqq.core.quantize import HQQLinear + + from ..quantizers.quantizers_utils import get_module_from_name + + # input_dict has {param_name: [tensor]} for the weight + value = list(input_dict.values())[0] + value = value[0] if isinstance(value, list) else value + + # full_layer_name is e.g. 
"model.layers.0.self_attn.q_proj.weight" + module_name = full_layer_name.rsplit(".", 1)[0] + module, _ = get_module_from_name(model, full_layer_name) + + # Load weight into the nn.Linear module + module.weight = torch.nn.Parameter(value, requires_grad=False) + + # Get the quant_config that was set in _process_model_before_weight_loading + quant_config = getattr(module, "quant_config", None) + if quant_config is None: + # Module is skipped from quantization, just return the weight as-is + return {full_layer_name: value} + + # Determine target device and compute dtype + target_device = value.device + compute_dtype = self.hf_quantizer.dtype + + # Create HQQLinear from the nn.Linear + hqq_layer = HQQLinear( + module, + quant_config=quant_config, + compute_dtype=compute_dtype, + device=target_device, + del_orig=True, + ) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.hf_quantizer.using_multi_gpu: + hqq_layer = self.hf_quantizer._patch_layer_for_multigpu(hqq_layer) + + # Replace the module in the model + parent_module_name, _, child_name = module_name.rpartition(".") + parent_module = model.get_submodule(parent_module_name) if parent_module_name else model + setattr(parent_module, child_name, hqq_layer) + + # Mark as loaded so it's not reported as missing + missing_keys = kwargs.get("missing_keys") + if missing_keys is not None: + missing_keys.discard(full_layer_name) + + # Return empty dict so the loading code doesn't try to set params + return {} + + +class HqqDeserialize: + """Deserialize HQQ pre-quantized weights into an HQQLinear module.""" + + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert( + self, + input_dict, + full_layer_name=None, + model=None, + **kwargs, + ): + from hqq.core.quantize import HQQLinear + + # Unwrap list values + state_dict = {} + for key, value in input_dict.items(): + state_dict[key] = value[0] if isinstance(value, list) else value + + # If W_q is not present, this is not an HQQ-quantized layer — pass through + if "W_q" not in state_dict: + return input_dict + + # full_layer_name is e.g. 
"model.layers.0.self_attn.v_proj.weight" + # (target pattern "weight" appended to module path) + module_name = full_layer_name.rsplit(".", 1)[0] + + parent_name, _, child_name = module_name.rpartition(".") + parent = model.get_submodule(parent_name) if parent_name else model + + # Create empty HQQLinear + hqq_layer = HQQLinear( + None, + None, + compute_dtype=self.hf_quantizer.dtype or torch.float16, + device="cpu", + initialize=False, + ) + + # Make W_q an nn.Parameter as HQQ expects + if "W_q" in state_dict: + state_dict["W_q"] = torch.nn.Parameter(state_dict["W_q"], requires_grad=False) + + hqq_layer.load_state_dict(state_dict) + + if hqq_layer.bias is not None and isinstance(hqq_layer.bias, torch.Tensor): + hqq_layer.bias = torch.nn.Parameter(hqq_layer.bias) + + if self.hf_quantizer.using_multi_gpu: + hqq_layer = self.hf_quantizer._patch_layer_for_multigpu(hqq_layer) + + setattr(parent, child_name, hqq_layer) + + # Mark weight and bias as loaded + missing_keys = kwargs.get("missing_keys") + if missing_keys is not None: + missing_keys.discard(full_layer_name) + # Also discard bias since HQQLinear handles it internally + bias_key = module_name + ".bias" + missing_keys.discard(bias_key) + + return {} diff --git a/src/transformers/integrations/hub_kernels.py b/src/transformers/integrations/hub_kernels.py index 70a343424aa8..c541e939b07b 100644 --- a/src/transformers/integrations/hub_kernels.py +++ b/src/transformers/integrations/hub_kernels.py @@ -359,7 +359,11 @@ def load_and_register_attn_kernel( # Register the kernel as a valid attention ALL_ATTENTION_FUNCTIONS.register(attn_implementation, kernel_function) - ALL_MASK_ATTENTION_FUNCTIONS.register(attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS["flash_attention_2"]) + + # Allow the kernel module to declare its preferred mask function (e.g., MASK_FUNCTION = "sdpa"). + # Falls back to "flash_attention_2" for backward compatibility with existing kernels. + mask_type = getattr(kernel, "MASK_FUNCTION", "flash_attention_2") + ALL_MASK_ATTENTION_FUNCTIONS.register(attn_implementation, ALL_MASK_ATTENTION_FUNCTIONS[mask_type]) return kernel diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 2656b7169c62..83a653261cbf 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -2261,7 +2261,7 @@ class SwanLabCallback(TrainerCallback): A [`TrainerCallback`] that logs metrics, media, model checkpoints to [SwanLab](https://swanlab.cn/). """ - def __init__(self): + def __init__(self, **kwargs): if not is_swanlab_available(): raise RuntimeError("SwanLabCallback requires swanlab to be installed. 
Run `pip install swanlab`.") import swanlab @@ -2269,6 +2269,7 @@ def __init__(self): self._swanlab = swanlab self._initialized = False self._log_model = os.getenv("SWANLAB_LOG_MODEL", None) + self._init_kwargs = kwargs def setup(self, args, state, model, **kwargs): """ @@ -2352,6 +2353,7 @@ def setup(self, args, state, model, **kwargs): init_args["resume"] = "allow" if self._swanlab.get_run() is None: + init_args.update(self._init_kwargs) self._swanlab.init( **init_args, ) diff --git a/src/transformers/integrations/moe.py b/src/transformers/integrations/moe.py index c8a8e87f3621..acc9b3575a14 100644 --- a/src/transformers/integrations/moe.py +++ b/src/transformers/integrations/moe.py @@ -15,6 +15,8 @@ from collections.abc import Callable from functools import wraps +from torch.distributed.tensor import DTensor + from ..utils import logging from ..utils.generic import GeneralInterface from ..utils.import_utils import ( @@ -354,12 +356,17 @@ def _grouped_linear( Returns: `torch.Tensor`: Output tensor of shape (S, output_dim). """ + # torch._grouped_mm is not registered for autocast, so we need to ensure + # input and weight have the same dtype (e.g. LayerNorm outputs float32 under + # autocast while weights may be bfloat16). + input = input.to(weight.dtype) + if is_transposed: # (S, input_dim) @ grouped (num_experts, input_dim, output_dim) -> (S, output_dim) out = _grouped_mm(input, weight, offs=offs) else: # (S, input_dim) @ grouped (num_experts, output_dim, input_dim).T -> (S, output_dim) - out = _grouped_mm(input, weight.transpose(-2, -1), offs=offs) + out = _grouped_mm(input, weight.transpose(-2, -1).contiguous(), offs=offs) if bias is not None: # We should be able to pass bias to the grouped_mm call, but it's not yet supported. @@ -401,21 +408,29 @@ def grouped_mm_experts_forward( # Compute offsets for grouped_mm # using histc instead of bincount to avoid cuda graph issues # With deterministic algorithms, CPU only supports float input, CUDA only supports int input. - histc_input = expert_ids_g.float() if device.type == "cpu" else expert_ids_g.int() + + # torch.histc() does not support integer dtypes on CPU and MPS. + # It works well and is more efficient on CUDA when using int. + # For all other backends (XPU, TPU/XLA, HPU, etc.), we conservatively + # use float32 as it has broader operator suppor + histc_input = expert_ids_g.int() if device.type == "cuda" else expert_ids_g.to(torch.float32) tokens_per_expert = torch.histc(histc_input, bins=self.num_experts, min=0, max=self.num_experts - 1) offsets = torch.cumsum(tokens_per_expert, dim=0, dtype=torch.int32) + def _local(p): + return p.to_local() if isinstance(p, DTensor) else p + # Select expert weights and biases # NOTE: We keep all experts here and rely on offsets to target the active ones. # I have already implemented a version that only passes the active experts, but # to do so I had to use torch.unique which breaks the graph capture (data-dependent). # Also there were no speedup gains from it in my experiments, even in eager mode. 
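
The `histc`/`cumsum` comment above is the core of how grouped expert matmuls consume routing information: per-expert token counts become end offsets into the expert-sorted token tensor. A small self-contained sketch of that bookkeeping (the grouped matmul itself, `torch._grouped_mm`, is only mentioned in a comment since it is a private op):

```python
import torch

num_experts = 4
# Expert id of each routed token, already grouped/sorted by expert.
expert_ids = torch.tensor([0, 0, 1, 2, 2, 2])

# histc does not accept integer inputs on CPU/MPS, so cast to float there; on CUDA an
# integer input is accepted (and cheaper), which is the branch taken above.
counts = torch.histc(expert_ids.float(), bins=num_experts, min=0, max=num_experts - 1)
offsets = torch.cumsum(counts, dim=0, dtype=torch.int32)  # tensor([2, 3, 6, 6])

# offsets[i] marks the end of expert i's slice along the token dimension; a grouped matmul
# kernel consumes these boundaries instead of looping over experts in Python.
```
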
if self.has_gate: - selected_weights = self.gate_up_proj - selected_biases = self.gate_up_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.gate_up_proj) + selected_biases = _local(self.gate_up_proj_bias)[expert_ids_g] if self.has_bias else None else: - selected_weights = self.up_proj - selected_biases = self.up_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.up_proj) + selected_biases = _local(self.up_proj_bias)[expert_ids_g] if self.has_bias else None # --- Up projection per expert (grouped) --- proj_out = _grouped_linear( @@ -431,8 +446,8 @@ def grouped_mm_experts_forward( proj_out = self.act_fn(proj_out) # (S, intermediate_dim) # Select down projection weights and biases - selected_weights = self.down_proj - selected_biases = self.down_proj_bias[expert_ids_g] if self.has_bias else None + selected_weights = _local(self.down_proj) + selected_biases = _local(self.down_proj_bias)[expert_ids_g] if self.has_bias else None # --- Down projection per expert (grouped) --- proj_out = _grouped_linear( diff --git a/src/transformers/integrations/mxfp4.py b/src/transformers/integrations/mxfp4.py index 67d9420659af..018507a5134b 100644 --- a/src/transformers/integrations/mxfp4.py +++ b/src/transformers/integrations/mxfp4.py @@ -498,15 +498,18 @@ def mlp_forward(self, hidden_states): else: routing = triton_kernels_hub.routing.routing - batch_size = hidden_states.shape[0] - hidden_states = hidden_states.reshape(-1, self.router.hidden_dim) + is_3d = hidden_states.ndim == 3 + if is_3d: + batch_size, seq_len, _ = hidden_states.shape + hidden_states = hidden_states.reshape(-1, self.router.hidden_dim) router_logits = nn.functional.linear(hidden_states, self.router.weight, self.router.bias) with on_device(router_logits.device): routing_data, gather_idx, scatter_idx = routing(router_logits, self.router.top_k) routed_out = self.experts(hidden_states, routing_data, gather_idx, scatter_idx=scatter_idx) - routed_out = routed_out.reshape(batch_size, -1, self.router.hidden_dim) + if is_3d: + routed_out = routed_out.reshape(batch_size, seq_len, self.router.hidden_dim) return routed_out, router_logits diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 7b93e0a134b8..cad07bc2d3fc 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -34,6 +34,7 @@ Transpose, WeightConverter, WeightRenaming, + rename_source_key, ) from ..utils import ( CONFIG_NAME, @@ -47,7 +48,7 @@ logging, ) from ..utils.hub import DownloadKwargs -from ..utils.loading_report import log_state_dict_report +from ..utils.loading_report import LoadStateDictInfo, log_state_dict_report if is_torch_available(): @@ -506,6 +507,7 @@ def load_adapter( `find_adapter_config_file` method. """ from peft import PeftType + from peft.tuners.tuners_utils import BaseTunerLayer from peft.utils.save_and_load import _maybe_shard_state_dict_for_tp from ..modeling_utils import LoadStateDictConfig, _get_resolved_checkpoint_files, load_state_dict @@ -618,45 +620,92 @@ def load_adapter( device_map = getattr(self, "hf_device_map", {"": self.device}) - # If the model is tensor parallel, we handle the sharding of the state dict here since the logic in `self._load_pretrained_model` - # is not compatible with the way PEFT adapter should be sharded. 
- has_tp_adapters = False - for module in self.modules(): - tp_info = getattr(module, "_tp_info", None) - if tp_info is not None: - has_tp_adapters = True - break - - if has_tp_adapters: + def _resolve_adapter_state_dict(): + # Materialize the adapter state dict from `adapter_state_dict` or `checkpoint_files`. Used by paths + # that bypass `self._load_pretrained_model` (which would otherwise read the files itself). all_pointer = set() if adapter_state_dict is not None: - merged_state_dict = adapter_state_dict - elif ( - checkpoint_files is not None - and checkpoint_files[0].endswith(".safetensors") - and adapter_state_dict is None - ): + return adapter_state_dict + if checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors"): merged_state_dict = {} for file in checkpoint_files: file_pointer = safe_open(file, framework="pt", device="cpu") all_pointer.add(file_pointer) for k in file_pointer.keys(): merged_state_dict[k] = file_pointer.get_tensor(k) + return merged_state_dict # Checkpoints are .bin - elif checkpoint_files is not None: + if checkpoint_files is not None: merged_state_dict = {} for ckpt_file in checkpoint_files: merged_state_dict.update(load_state_dict(ckpt_file)) - else: - raise ValueError("Neither a state dict nor checkpoint files were found.") + return merged_state_dict + raise ValueError("Neither a state dict nor checkpoint files were found.") - adapter_state_dict = merged_state_dict + def set_inference_mode(model): + model.eval() + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + module.requires_grad_(False) + + # If the model is tensor parallel, we handle the sharding of the state dict here since the logic in `self._load_pretrained_model` + # is not compatible with the way PEFT adapter should be sharded. + has_tp_adapters = False + for module in self.modules(): + tp_info = getattr(module, "_tp_info", None) + if tp_info is not None: + has_tp_adapters = True + break + + if has_tp_adapters: + adapter_state_dict = _resolve_adapter_state_dict() if any(not isinstance(v, torch.Tensor) for v in adapter_state_dict.values()): raise ValueError("Expected all values in the adapter state dict to be tensors.") _maybe_shard_state_dict_for_tp(self, adapter_state_dict, adapter_name) + if hotswap: + # Bypass the standard loader and use PEFT's hotswap path so that LoRA weights + # whose rank differs from the existing adapter's are copied (and zero-padded) + # in place rather than triggering a "size mismatch" reinit, and so the LoRA + # scaling is updated alongside the weights. 
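
The hotswap branch documented above exists so a second LoRA adapter can replace the first inside an already-compiled model without retracing. A hypothetical end-to-end sketch of that flow; the base model and adapter repos are placeholders, and the `target_rank` keyword passed to `enable_peft_hotswap` is an assumption based on PEFT's `prepare_model_for_compiled_hotswap`:

```python
# Hypothetical usage; model and adapter names are placeholders.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("org/base-model")
model.enable_peft_hotswap(target_rank=16)     # pad LoRA ranks up-front (assumed keyword)
model.load_adapter("org/lora-adapter-a", adapter_name="default")
compiled = torch.compile(model)

# ... run compiled(...) ...

# Swap the weights in place: smaller ranks are zero-padded to target_rank,
# so the compiled graph is reused instead of being recompiled.
model.load_adapter("org/lora-adapter-b", adapter_name="default", hotswap=True)
```
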
+ from peft.utils.hotswap import check_hotswap_configs_compatible, hotswap_adapter_from_state_dict + + adapter_state_dict = _resolve_adapter_state_dict() + + # need to apply conversions manually as we don't use _load_pretrained_model + renamings = [r for r in peft_weight_conversions if isinstance(r, WeightRenaming)] + converters = [c for c in peft_weight_conversions if isinstance(c, WeightConverter)] + meta_state_dict = self.state_dict() + processed_state_dict = {} + for key, value in adapter_state_dict.items(): + renamed_key, _ = rename_source_key(key, renamings, converters, self.base_model_prefix, meta_state_dict) + processed_state_dict[renamed_key] = value + + check_hotswap_configs_compatible(self.peft_config[adapter_name], peft_config) + try: + hotswap_adapter_from_state_dict( + model=self, + state_dict=processed_state_dict, + adapter_name=adapter_name, + config=peft_config, + ) + except Exception as e: + logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error:\n{e}") + raise + + if peft_config.inference_mode: + set_inference_mode(self) + + return LoadStateDictInfo( + missing_keys=set(), + unexpected_keys=set(), + mismatched_keys=set(), + error_msgs=[], + conversion_errors={}, + ) + load_config = replace( load_config, pretrained_model_name_or_path=peft_model_id, @@ -676,12 +725,7 @@ def load_adapter( ) if peft_config.inference_mode: - from peft.tuners.tuners_utils import BaseTunerLayer - - self.eval() - for module in self.modules(): - if isinstance(module, BaseTunerLayer): - module.requires_grad_(False) + set_inference_mode(self) adapter_key_markers = {adapter_name} if peft_config is not None and getattr(peft_config, "peft_type", None) is not None: @@ -699,6 +743,16 @@ def is_adapter_key(key: str) -> bool: loading_info=loading_info, logger=logger, ) + + if self._prepare_peft_hotswap_kwargs is not None: + # Apply once, after the first adapter has been loaded but before the model is + # compiled, so the LoRA layers get padded up to target_rank and a later adapter + # with a different rank can be hot-swapped in without recompiling. + from peft.utils.hotswap import prepare_model_for_compiled_hotswap + + prepare_model_for_compiled_hotswap(self, config=peft_config, **self._prepare_peft_hotswap_kwargs) + self._prepare_peft_hotswap_kwargs = None + return loading_info def enable_peft_hotswap( diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py index bdf82e8490f0..21f0a833ef08 100644 --- a/src/transformers/integrations/tensor_parallel.py +++ b/src/transformers/integrations/tensor_parallel.py @@ -29,6 +29,7 @@ import torch import torch.distributed as dist from torch import nn + from torch.distributed.tensor import DTensor, Shard # Cache this result has it's a C FFI call which can be pretty time-consuming _torch_distributed_available = torch.distributed.is_available() @@ -46,8 +47,11 @@ def initialize_tensor_parallelism( """ if tp_size is not None and tp_plan is None: raise ValueError("tp_plan has to be set when tp_size is passed.") - if tp_plan is not None and device_map is not None: - raise ValueError("`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization.") + if tp_plan is not None and device_map is not None and device_map != "meta" and device_mesh is None: + raise ValueError( + "`tp_plan` and `device_map` are mutually exclusive. " + "Choose either one for parallelization or include a `device_mesh`." 
+ ) if device_mesh is None: if not is_torch_greater_or_equal("2.5"): raise OSError("Tensor parallel is only supported for `torch>=2.5`.") @@ -97,7 +101,8 @@ def initialize_tensor_parallelism( ) device_mesh = device_mesh["tp"] tp_size = device_mesh.size() - device_map = torch.device(f"{device_mesh.device_type}:{int(os.environ['LOCAL_RANK'])}") + if device_map is None: + device_map = torch.device(f"{device_mesh.device_type}:{int(os.environ['LOCAL_RANK'])}") return device_map, device_mesh, tp_size @@ -130,6 +135,17 @@ def _get_parameter_tp_plan(parameter_name: str, tp_plan: dict[str, str], is_weig return None +def get_ep_sharded_param_names(model) -> list[str]: + """FQNs of parameters whose data is per-rank unique under EP sharding.""" + if not getattr(model, "has_ep", False): + return [] + return [ + name + for name, _ in model.named_parameters() + if _get_parameter_tp_plan(parameter_name=name, tp_plan=model.tp_plan, is_weight=True) == "grouped_gemm" + ] + + # ============================================================================= # Tensor Sharding Utilities # ============================================================================= @@ -685,6 +701,14 @@ def update_module_attributes(self, module: nn.Module): """ pass + def post_shard_wrap(self, param: nn.Parameter) -> nn.Parameter: + """ + Optional final wrap applied to a parameter after `shard_tensor` and before it is + attached to the module. Default is identity. Subclasses can override to e.g. wrap + the local shard as a DTensor. + """ + return param + class ColwiseParallel(TensorParallelLayer): """ @@ -966,8 +990,8 @@ def _prepare_output_fn(self, mod, outputs, device_mesh): if self.embedding_dim_sharding == 0 and hasattr(mod, "_input_mask"): input_mask = mod._input_mask # Use multiplication instead of in-place assignment to preserve gradients - mask_expanded = input_mask.unsqueeze(-1).expand_as(outputs) - outputs = outputs * (~mask_expanded).to(outputs.dtype) + mask = input_mask.unsqueeze(-1) + outputs = outputs * (~mask).to(outputs.dtype) del mod._input_mask return all_reduce_forward(outputs, device_mesh) @@ -1078,6 +1102,15 @@ def update_module_attributes(self, module: nn.Module): if hasattr(module, "num_experts"): module.num_experts = self.get_expected_sharded_shape((self.empty_param.shape[0],))[0] + def post_shard_wrap(self, param: nn.Parameter) -> nn.Parameter: + """ + Wrap the EP-sharded local tensor as a DTensor on the TP/EP mesh. Without this, the + optimizer's foreach ops error with "mixed Tensor and DTensor" against the + FSDP-wrapped DTensor params on the rest of the model. 
+ """ + dt = DTensor.from_local(param.data, self.device_mesh, [Shard(0)], run_check=False) + return nn.Parameter(dt, requires_grad=param.requires_grad) + class RouterParallel(TensorParallelLayer): """ @@ -1488,6 +1521,8 @@ def shard_and_distribute_module( # otherwise loading is crazy slow if not isinstance(param, torch.nn.Parameter): param = torch.nn.Parameter(param, requires_grad=empty_param.is_floating_point()) + if current_shard_plan is not None: + param = tp_layer.post_shard_wrap(param) setattr(module_to_tp, param_type, param) if tp_layer is not None: tp_layer.update_module_attributes(module_to_tp) diff --git a/src/transformers/integrations/torchao.py b/src/transformers/integrations/torchao.py index 421a004dd6e9..2fa20a3982b9 100644 --- a/src/transformers/integrations/torchao.py +++ b/src/transformers/integrations/torchao.py @@ -35,19 +35,10 @@ logger = logging.get_logger(__name__) -def _quantization_type(weight): - from torchao.dtypes import AffineQuantizedTensor - from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor - - if isinstance(weight, AffineQuantizedTensor): - return f"{weight.__class__.__name__}({weight._quantization_type()})" - - if isinstance(weight, LinearActivationQuantizedTensor): - return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})" - - def _linear_extra_repr(self): - weight = _quantization_type(self.weight) + from torchao.utils import TorchAOBaseTensor + + weight = self.weight.__class__.__name__ if isinstance(self.weight, TorchAOBaseTensor) else None if weight is None: return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None" else: diff --git a/src/transformers/integrations/tpu.py b/src/transformers/integrations/tpu.py index a329a7fcdd84..e05776aab7fe 100644 --- a/src/transformers/integrations/tpu.py +++ b/src/transformers/integrations/tpu.py @@ -162,7 +162,9 @@ def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}): return model -def save_tpu_checkpoint(model, args, accelerator, processing_class, is_fsdp_xla_v1_enabled, output_dir=None): +def save_tpu_checkpoint( + model, args, accelerator, processing_class, is_fsdp_xla_v1_enabled, is_fsdp_xla_v2_enabled, output_dir=None +): """ Saves a model checkpoint on TPU/XLA devices. @@ -175,10 +177,13 @@ def save_tpu_checkpoint(model, args, accelerator, processing_class, is_fsdp_xla_ accelerator (`Accelerator`): The accelerator instance. processing_class: The processing class (tokenizer/processor) to save alongside the model. is_fsdp_xla_v1_enabled (`bool`): Whether FSDP XLA v1 is enabled. + is_fsdp_xla_v2_enabled (`bool`): Whether FSDP XLA v2 is enabled. output_dir (`str`, *optional*): The directory to save to. Defaults to `args.output_dir`. 
""" import torch_xla.core.xla_model as xm + from ..modeling_utils import unwrap_model + output_dir = output_dir if output_dir is not None else args.output_dir logger.info(f"Saving model checkpoint to {output_dir}") @@ -219,15 +224,16 @@ def save_tpu_checkpoint(model, args, accelerator, processing_class, is_fsdp_xla_ logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) elif not isinstance(model, supported_classes): - if isinstance(accelerator.unwrap_model(model), supported_classes): - accelerator.unwrap_model(model).save_pretrained( + unwrapped_model = unwrap_model(model, recursive=is_fsdp_xla_v2_enabled) + if isinstance(unwrapped_model, supported_classes): + unwrapped_model.save_pretrained( output_dir, is_main_process=args.should_save, - state_dict=xm._maybe_convert_to_cpu(model.state_dict()), + state_dict=xm._maybe_convert_to_cpu(unwrapped_model.state_dict()), ) else: logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") - state_dict = xm._maybe_convert_to_cpu(model.state_dict()) + state_dict = xm._maybe_convert_to_cpu(unwrapped_model.state_dict()) xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: model.save_pretrained( diff --git a/src/transformers/loss/loss_for_object_detection.py b/src/transformers/loss/loss_for_object_detection.py index 52b43f779f35..79469785827d 100644 --- a/src/transformers/loss/loss_for_object_detection.py +++ b/src/transformers/loss/loss_for_object_detection.py @@ -31,7 +31,7 @@ from transformers.image_transforms import center_to_corners_format -def dice_loss(inputs, targets, num_boxes): +def dice_loss(inputs, targets, num_boxes, valid_mask=None): """ Compute the DICE loss, similar to generalized IOU for masks @@ -41,16 +41,25 @@ def dice_loss(inputs, targets, num_boxes): targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). + valid_mask: Optional boolean tensor with the same shape as inputs. + If provided, only valid (non-padding) areas are considered in the loss. + True means valid, False means padding. """ inputs = inputs.sigmoid() inputs = inputs.flatten(1) + + if valid_mask is not None: + valid_mask = valid_mask.flatten(1).to(dtype=inputs.dtype) + inputs = inputs * valid_mask + targets = targets * valid_mask + numerator = 2 * (inputs * targets).sum(1) denominator = inputs.sum(-1) + targets.sum(-1) loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_boxes -def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, valid_mask=None): """ Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002. @@ -64,6 +73,9 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f Optional weighting factor in the range (0,1) to balance positive vs. negative examples. gamma (`int`, *optional*, defaults to `2`): Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + valid_mask: Optional boolean tensor with the same shape as inputs. + If provided, only valid (non-padding) areas are considered in the loss. + True means valid, False means padding. 
Returns: Loss tensor @@ -78,6 +90,13 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss + if valid_mask is not None: + valid_mask = valid_mask.flatten(1).to(dtype=loss.dtype) + loss = loss * valid_mask + # Average only over valid pixels per sample + valid_count = valid_mask.sum(1).clamp(min=1) + return (loss.sum(1) / valid_count).sum() / num_boxes + return loss.mean(1).sum() / num_boxes @@ -193,11 +212,16 @@ def loss_masks(self, outputs, targets, indices, num_boxes): source_masks = outputs["pred_masks"] source_masks = source_masks[source_idx] masks = [t["masks"] for t in targets] - # TODO use valid to mask invalid areas due to padding in loss target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() target_masks = target_masks.to(source_masks) target_masks = target_masks[target_idx] + # Get valid mask for selected targets (invert: True = valid, False = padding) + # valid has shape (batch, h, w), we need to index by batch indices only + batch_idx = target_idx[0] + valid_mask = ~valid + valid_mask = valid_mask[batch_idx] + # upsample predictions to the target size source_masks = nn.functional.interpolate( source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False @@ -206,9 +230,12 @@ def loss_masks(self, outputs, targets, indices, num_boxes): target_masks = target_masks.flatten(1) target_masks = target_masks.view(source_masks.shape) + valid_mask = valid_mask.flatten(1) + valid_mask = valid_mask.view(source_masks.shape) + losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), + "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes, valid_mask=valid_mask), + "loss_dice": dice_loss(source_masks, target_masks, num_boxes, valid_mask=valid_mask), } return losses diff --git a/src/transformers/loss/loss_rt_detr.py b/src/transformers/loss/loss_rt_detr.py index cf6d6ad05940..69dc1ff67600 100644 --- a/src/transformers/loss/loss_rt_detr.py +++ b/src/transformers/loss/loss_rt_detr.py @@ -270,6 +270,12 @@ def loss_masks(self, outputs, targets, indices, num_boxes): target_masks = target_masks.to(source_masks) target_masks = target_masks[target_idx] + # Get valid mask for selected targets (invert: True = valid, False = padding) + # valid has shape (batch, h, w), we need to index by batch indices only + batch_idx = target_idx[0] + valid_mask = ~valid + valid_mask = valid_mask[batch_idx] + # upsample predictions to the target size source_masks = nn.functional.interpolate( source_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False @@ -278,9 +284,12 @@ def loss_masks(self, outputs, targets, indices, num_boxes): target_masks = target_masks.flatten(1) target_masks = target_masks.view(source_masks.shape) + valid_mask = valid_mask.flatten(1) + valid_mask = valid_mask.view(source_masks.shape) + losses = { - "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes), - "loss_dice": dice_loss(source_masks, target_masks, num_boxes), + "loss_mask": sigmoid_focal_loss(source_masks, target_masks, num_boxes, valid_mask=valid_mask), + "loss_dice": dice_loss(source_masks, target_masks, num_boxes, valid_mask=valid_mask), } return losses diff --git a/src/transformers/loss/loss_utils.py b/src/transformers/loss/loss_utils.py index 51564d299e55..07a75fbb57b1 100644 --- a/src/transformers/loss/loss_utils.py +++ 
b/src/transformers/loss/loss_utils.py @@ -31,10 +31,14 @@ def fixed_cross_entropy( target: torch.Tensor, num_items_in_batch: torch.Tensor | None = None, ignore_index: int = -100, + weight: torch.Tensor | None = None, + label_smoothing: float = 0.0, **kwargs, ) -> torch.Tensor: reduction = "sum" if num_items_in_batch is not None else "mean" - loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction) + loss = nn.functional.cross_entropy( + source, target, ignore_index=ignore_index, weight=weight, reduction=reduction, label_smoothing=label_smoothing + ) if reduction == "sum": # just in case users pass an int for num_items_in_batch, which could be the case for custom trainer if torch.is_tensor(num_items_in_batch): @@ -52,9 +56,6 @@ def ForCausalLMLoss( shift_labels: torch.Tensor | None = None, **kwargs, ) -> torch.Tensor: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - if shift_labels is None: # Shift so that tokens < n predict n labels = nn.functional.pad(labels, (0, 1), value=ignore_index) @@ -63,6 +64,13 @@ def ForCausalLMLoss( # Flatten the tokens logits = logits.view(-1, vocab_size) shift_labels = shift_labels.view(-1) + # Filter out the ignore_index labels + mask = shift_labels != ignore_index + shift_labels = shift_labels[mask] + logits = logits[mask.to(logits.device)] + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.float() + # Enable model parallelism shift_labels = shift_labels.to(logits.device) loss = fixed_cross_entropy(logits, shift_labels, num_items_in_batch, ignore_index, **kwargs) return loss diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index e833a5a8a2ab..8a7195f13806 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -21,8 +21,7 @@ import httpx import yaml from huggingface_hub import is_offline_mode, model_info -from huggingface_hub.errors import OfflineModeIsEnabled -from huggingface_hub.utils import HFValidationError +from huggingface_hub.errors import HFValidationError, OfflineModeIsEnabled from . import __version__ from .models.auto.modeling_auto import ( diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 32642d71d2a3..690254513618 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -328,7 +328,7 @@ def _pad_input(hidden_states, indices, batch, seqlen): return output.view(batch, seqlen, *dim) -def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]: +def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, int]: """ Retrieves indexing data required to repad unpadded (ragged) tensors. @@ -337,19 +337,15 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.T Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. Return: - indices (`torch.Tensor`): - The indices of non-masked tokens from the flattened input sequence. cu_seqlens (`torch.Tensor`): The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). max_seqlen_in_batch (`int`): Maximum sequence length in batch. 
""" seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max() cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) return ( - indices, cu_seqlens, max_seqlen_in_batch, ) @@ -396,7 +392,8 @@ def _upad_input( (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`): Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value). """ - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + flatten_mask = attention_mask.reshape(-1).bool() # With static caches, the k/v states may be larger than the mask -> we need to slice them to avoid generating garbage # It's a bit of an anti-pattern, but otherwise we silently compute wrong attentions scores @@ -405,13 +402,15 @@ def _upad_input( batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - key_layer = _index_first_axis(key_layer, indices_k) - value_layer = _index_first_axis(value_layer, indices_k) + key_layer = _index_first_axis(key_layer, flatten_mask) + value_layer = _index_first_axis(value_layer, flatten_mask) if query_length == kv_seq_len: - query_layer = _index_first_axis(query_layer, indices_k) + query_layer = _index_first_axis(query_layer, flatten_mask) cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k + # NOTE: Similar to the `.item()` in prepare_fa2_from_position_ids, with torch compile, + # this might cause a graph break + max_seqlen_in_batch_q = max_seqlen_in_batch_k.item() + indices_q = flatten_mask.nonzero(as_tuple=False).flatten() elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = torch.arange( @@ -517,7 +516,7 @@ def _is_packed_sequence(position_ids, batch_size): 2. Flattened sequences only are supported 3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. we have multiple increasing sequences """ - if position_ids is None: + if is_tracing(position_ids) or position_ids is None: return False increasing_position_sequences = ( @@ -616,6 +615,21 @@ def _process_flash_attention_kwargs( flash_kwargs (`dict`): A dict of kwargs that are requested and supported. """ + + user_kwargs = { + "dropout_p": dropout, + "window_size": sliding_window, + "deterministic": deterministic, + "softcap": softcap, + "s_aux": s_aux, + } + # Note 'window_size' in supports_mapping maps to our 'sliding_window' param + for k, v in user_kwargs.items(): + if not supports_mapping[k] and v is not None: + raise ValueError( + f"Parameter `{k}` is not supported by this Flash Attention implementation but was set, please use a different attentionimplementation." 
+ ) + flash_kwargs = { "causal": is_causal and not (use_top_left_mask and query_length == 1), "softmax_scale": softmax_scale, diff --git a/src/transformers/modeling_layers.py b/src/transformers/modeling_layers.py index 1012606fcaaf..2aca6fda0aa3 100644 --- a/src/transformers/modeling_layers.py +++ b/src/transformers/modeling_layers.py @@ -102,7 +102,7 @@ def __init__(self, config): self.num_labels = config.num_labels # Similar to `self.model = AutoModel.from_config(config)` but allows to change the base model name if needed in the child class setattr(self, self.base_model_prefix, AutoModel.from_config(config)) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + self.score = nn.Linear(config.get_text_config().hidden_size, self.num_labels, bias=False) # Initialize weights and apply final processing self.post_init() @@ -137,13 +137,13 @@ def forward( else: batch_size = inputs_embeds.shape[0] - if self.config.pad_token_id is None and batch_size != 1: + if self.config.get_text_config().pad_token_id is None and batch_size != 1: raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: + if self.config.get_text_config().pad_token_id is None: last_non_pad_token = -1 elif input_ids is not None: # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id - non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + non_pad_mask = (input_ids != self.config.get_text_config().pad_token_id).to(logits.device, torch.int32) token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) else: diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index b041964bbdfc..b2ea39eeb850 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1316,6 +1316,12 @@ def __init__(self, config: PreTrainedConfig, *inputs, **kwargs): ) self.config = config self.name_or_path = config.name_or_path + quant_config = getattr(config, "quantization_config", None) + if quant_config is not None: + raise NotImplementedError( + "Quantization via `from_config()` is not supported. " + "Quantized models must be created via `from_pretrained()` with an appropriate backend." + ) # Check the attention implementation is supported, or set it if not yet set (on the internal attr, to avoid # setting it recursively) @@ -1368,6 +1374,9 @@ def post_init(self): self._keep_in_fp32_modules_strict = set(self._keep_in_fp32_modules_strict or []) # Current submodel must register its `_no_split_modules` as well self._no_split_modules = set(self._no_split_modules or []) + # Current submodel must register the `_keys_to_ignore_on_load_unexpected/missing` + self._keys_to_ignore_on_load_unexpected = self._keys_to_ignore_on_load_unexpected or [] + self._keys_to_ignore_on_load_missing = self._keys_to_ignore_on_load_missing or [] # Iterate over children only: as the final model is created, this is enough to gather the properties from all submodels. 
# This works because the way the `__init__` and `post_init` are called on all submodules is depth-first in the graph @@ -1390,17 +1399,40 @@ def post_init(self): # Record `_no_split_modules` from the children if no_split := getattr(module, "_no_split_modules", None): self._no_split_modules.update(no_split) + # Record `_keys_to_ignore_on_load_unexpected/missing` from the children + if ignore_unexpected := getattr(module, "_keys_to_ignore_on_load_unexpected", None): + self._keys_to_ignore_on_load_unexpected.extend( + [f"{name}.{child_name}" for child_name in ignore_unexpected] + ) + if ignore_missing := getattr(module, "_keys_to_ignore_on_load_missing", None): + self._keys_to_ignore_on_load_missing.extend([f"{name}.{child_name}" for child_name in ignore_missing]) + + # Preserve the current no-tie scope on this instance so only the model + # being initialized in that scope skips tie_weights(). + self._skip_tie_weights_scope = init._SKIP_TIE_WEIGHTS_SCOPE.get() # Maybe initialize the weights and tie the keys self.init_weights() self._backward_compatibility_gradient_checkpointing() + # Cache the list of (name, submodule) pairs where the submodule is a PreTrainedModel. + # This pattern is used in several places across the codebase; computing it once avoids + # repeated traversal of the full module tree. + self._named_pretrained_submodules: list[tuple[str, PreTrainedModel]] = [ + (name, module) for name, module in self.named_modules() if isinstance(module, PreTrainedModel) + ] + + @property + def has_ep(self) -> bool: + """Whether expert parallelism is enabled for this model.""" + distributed_config = getattr(getattr(self, "config", None), "distributed_config", None) + return distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) @property def tp_plan(self) -> dict[str, str]: """ The full tp plan for the model's modules """ - if hasattr(self.config, "distributed_config") and self.config.distributed_config.enable_expert_parallel: + if self.has_ep: return self._ep_plan return self._tp_plan @@ -2371,14 +2403,25 @@ def _init_weights(self, module): if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose1d, nn.ConvTranspose2d)): if getattr(module, "weight", None) is not None: - init.normal_(module.weight, mean=0.0, std=std) - if module.bias is not None: + if module.weight.dtype in (torch.int8, torch.uint8): + logger.debug( + f"Skipping weight initialization for quantized module {module.__class__.__name__} with dtype " + f"{module.weight.dtype}" + ) + else: + init.normal_(module.weight, mean=0.0, std=std) + if module.bias is not None and module.bias.dtype not in (torch.int8, torch.uint8): init.zeros_(module.bias) elif isinstance(module, nn.Embedding): - init.normal_(module.weight, mean=0.0, std=std) - # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag - if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False): - init.zeros_(module.weight[module.padding_idx]) + if module.weight.dtype in (torch.int8, torch.uint8): + logger.debug( + f"Skipping weight initialization for quantized embedding with dtype {module.weight.dtype}" + ) + else: + init.normal_(module.weight, mean=0.0, std=std) + # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag + if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False): + init.zeros_(module.weight[module.padding_idx]) elif 
isinstance(module, nn.MultiheadAttention): # This uses torch's original init module._reset_parameters() @@ -2591,6 +2634,9 @@ def tie_weights(self, missing_keys: set[str] | None = None, recompute_mapping: b `source` is missing in the checkpoint while `target` exists, we *swap* source and target so we can still tie everything to the parameter that actually exists. """ + if init.should_skip_tie_weights(self): + return + # In this case, the keys stored in `all_tied_weights_keys` are already correct if not recompute_mapping: tied_keys = self.all_tied_weights_keys @@ -3338,8 +3384,10 @@ def save_pretrained( files_timestamps = self._get_files_timestamps(save_directory) metadata = {} + quantizer_provided_state_dict = False if hf_quantizer is not None: state_dict, metadata = hf_quantizer.get_state_dict_and_metadata(self) + quantizer_provided_state_dict = state_dict is not None metadata["format"] = "pt" # Only save the model itself if we are using distributed training @@ -3428,7 +3476,8 @@ def save_pretrained( state_dict = remove_tied_weights_from_state_dict(state_dict, model_to_save) # Revert all renaming and/or weight operations - if save_original_format and not _hf_peft_config_loaded: + # Skip if the quantizer already provided the state_dict in the correct serialization format + if save_original_format and not _hf_peft_config_loaded and not quantizer_provided_state_dict: state_dict = revert_weight_conversion(model_to_save, state_dict) # Shard the model if it is too big. @@ -3671,14 +3720,27 @@ def float(self, *args): @classmethod def get_init_context( - cls, dtype: torch.dtype, is_quantized: bool, _is_ds_init_called: bool, allow_all_kernels: bool | None + cls, + dtype: torch.dtype, + is_quantized: bool, + _is_ds_init_called: bool, + allow_all_kernels: bool | None, + distributed_config=None, ): # Need to instantiate with correct dtype init_contexts = [local_torch_dtype(dtype, cls.__name__), init.no_tie_weights(), apply_patches()] # Needed as we cannot forward the `allow_all_kernels` arg in the model's __init__ if allow_all_kernels: init_contexts.append(allow_all_hub_kernels()) - if is_deepspeed_zero3_enabled(): + _has_ep = distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) + if _has_ep and is_deepspeed_zero3_enabled(): + # EP + DeepSpeed: use meta device (same as the normal non-DS path). + # zero.Init is skipped because EP needs to shard experts via distribute_model() + # hooks, which are incompatible with ZeRO-3 lazy parameters. + # The standard weight loading path (not zero3) handles EP sharding via + # shard_and_distribute_module. deepspeed.initialize() wraps the result later. + init_contexts.extend([torch.device("meta"), init.meta_device_safe_creation_ops()]) + elif is_deepspeed_zero3_enabled(): import deepspeed # We cannot initialize the model on meta device with deepspeed when not quantized @@ -4086,6 +4148,12 @@ def from_pretrained( download_kwargs_with_commit, **adapter_kwargs, ) + # EP + DeepSpeed: clear device_map (set by initialize_tensor_parallelism) so the model + # loads on CPU first. distribute_model() handles GPU placement during EP sharding. + # Without this, device_map triggers accelerate's dispatch path which breaks shard loading. 
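
The EP + DeepSpeed branch above swaps `zero.Init` for plain meta-device construction: parameters get shapes and dtypes but no storage until the checkpoint is loaded and the expert-parallel sharding hooks run. A standalone illustration of that construction pattern (not the actual init context used by `get_init_context`):

```python
import torch
from torch import nn

# Construct on the meta device: instant, no memory allocated for the weights.
with torch.device("meta"):
    layer = nn.Linear(4096, 4096)
print(layer.weight.device)   # meta
print(layer.weight.numel())  # 16777216, but backed by no storage

# Materialize real (uninitialized) storage later, then fill it from a checkpoint.
layer = layer.to_empty(device="cpu")
```
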
+ _has_ep = distributed_config is not None and getattr(distributed_config, "enable_expert_parallel", False) + if _has_ep and is_deepspeed_zero3_enabled(): + device_map = None device_map = check_and_set_device_map(device_map) # warn, error and fix the device map user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} @@ -4194,7 +4262,9 @@ def from_pretrained( register_fusion_patches(cls, config, fusion_config) - model_init_context = cls.get_init_context(dtype, is_quantized, _is_ds_init_called, allow_all_kernels) + model_init_context = cls.get_init_context( + dtype, is_quantized, _is_ds_init_called, allow_all_kernels, distributed_config + ) config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. with ContextManagers(model_init_context): @@ -4327,7 +4397,11 @@ def _load_pretrained_model( error_msgs = [] - if is_deepspeed_zero3_enabled() and not is_quantized: + # EP + DeepSpeed: skip zero3 loading path. The model was created on meta device + # (not via zero.Init), so params are not zero3-partitioned. The standard loading + # path handles EP sharding via shard_and_distribute_module using the EP plan hooks + # registered by distribute_model(). deepspeed.initialize() wraps the result later. + if is_deepspeed_zero3_enabled() and not is_quantized and not model.has_ep: if state_dict is None: merged_state_dict = {} for ckpt_file in checkpoint_files: @@ -4646,14 +4720,12 @@ def _move_missing_keys_from_meta_to_device( """ is_quantized = hf_quantizer is not None # This is the only case where we do not initialize the model on meta device, so we don't have to do anything here - if is_deepspeed_zero3_enabled() and not is_quantized: + # Exception: EP + DeepSpeed uses meta device (not zero.Init), so it needs the standard move path. + if is_deepspeed_zero3_enabled() and not is_quantized and not self.has_ep: return - # In this case we need to move everything back + # Leave parameters on meta on non-rank-0 FSDP ranks (rank-0 broadcast overwrites them); only buffers need real placeholders. if is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized: - for key, param in self.named_parameters(): - value = torch.zeros_like(param, device="cpu") - _load_parameter_into_model(self, key, value) for key, buffer in self.named_buffers(): value = torch.zeros_like(buffer, device="cpu") _load_parameter_into_model(self, key, value) @@ -4704,7 +4776,7 @@ def _initialize_missing_keys(self, is_quantized: bool) -> None: self._is_hf_initialized = True # This will only initialize submodules that are not marked as initialized by the line above. 
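
The guard added to `_initialize_missing_keys` below mirrors the earlier `_init_weights` change: a floating-point initializer must never run over already-quantized integer tensors. A compact sketch of that rule as a helper (the function name is made up for illustration):

```python
import torch
from torch import nn

def init_if_floating_point(module: nn.Module, std: float = 0.02) -> None:
    """Normal-init float weights; leave int8/uint8 (quantized) tensors untouched."""
    weight = getattr(module, "weight", None)
    if weight is None or not weight.dtype.is_floating_point:
        return
    nn.init.normal_(weight, mean=0.0, std=std)
    bias = getattr(module, "bias", None)
    if bias is not None and bias.dtype.is_floating_point:
        nn.init.zeros_(bias)
```
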
- if is_deepspeed_zero3_enabled() and not is_quantized: + if is_deepspeed_zero3_enabled() and not is_quantized and not self.has_ep: import deepspeed # keep_vars=True as we need the original tensors, so that the "_is_hf_initialized" is present on them @@ -4714,7 +4786,21 @@ def _initialize_missing_keys(self, is_quantized: bool) -> None: with deepspeed.zero.GatheredParameters(not_initialized_parameters, modifier_rank=0): self.initialize_weights() else: - self.initialize_weights() + try: + all_params = [p for p in self.parameters() if p is not None] + if all_params and not any(p.dtype.is_floating_point for p in all_params): + logger.info("Skipping weight initialization for quantized model (non-floating-point dtype).") + skip_weight_initialization = True + else: + skip_weight_initialization = False + except Exception: + skip_weight_initialization = False + + if not skip_weight_initialization: + self.initialize_weights() + else: + logger.info("Weight initialization skipped.") + def _adjust_missing_and_unexpected_keys(self, loading_info: LoadStateDictInfo) -> None: """Adjust the `missing_keys` and `unexpected_keys` based on current model's exception rules, to avoid @@ -4800,7 +4886,19 @@ def get_parameter_or_buffer(self, target: str): ): return module.get_extra_state() - raise AttributeError(f"`{target}` is neither a parameter, buffer, nor extra state.") + def __recursive_getattr(object, attribute, *args): + """Recurse through a parameter name that is '.' separated to get the attribute""" + + def __getattr(object, attribute): + return getattr(object, attribute, *args) + + return functools.reduce(__getattr, [object] + attribute.split(".")) + + try: + # get the actual tensor parameter from a possibly nested list + return __recursive_getattr(module, param_name) + except AttributeError: + raise AttributeError(f"`{target}` is neither a parameter, buffer, nor extra state.") def named_non_persistent_buffers( self, recurse: bool = True, remove_duplicate: bool = True diff --git a/src/transformers/models/afmoe/modeling_afmoe.py b/src/transformers/models/afmoe/modeling_afmoe.py index 421119b33deb..74fc8bc03b6a 100644 --- a/src/transformers/models/afmoe/modeling_afmoe.py +++ b/src/transformers/models/afmoe/modeling_afmoe.py @@ -103,7 +103,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index fa15fcce3de6..85b26d160058 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -17,9 +17,11 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...utils import auto_docstring +from ..efficientnet.image_processing_efficientnet import EfficientNetImageProcessorKwargs class AlignProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: EfficientNetImageProcessorKwargs # see processing_utils.ProcessingKwargs documentation for usage.
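The `@` to `*` change in the rotary-embedding hunk above is repeated for many models later in this diff (Apertus, Arcee, Aria, Bamba, BitNet, Chameleon, CSM, CWM, DBRX, DeepseekV3, and others). Because `inv_freq_expanded` has shape `(batch, dim/2, 1)` and `position_ids_expanded` has shape `(batch, 1, seq_len)`, the batched matmul reduces over a single element, so a broadcast elementwise multiply produces the same `(batch, dim/2, seq_len)` tensor. A quick standalone check in plain PyTorch, for illustration only:

```python
import torch

batch, dim_half, seq_len = 2, 8, 5
inv_freq_expanded = torch.rand(batch, dim_half, 1)
position_ids_expanded = torch.rand(batch, 1, seq_len)

# (batch, dim_half, 1) @ (batch, 1, seq_len) is a per-batch outer product:
# each output element is a single multiplication, with nothing to accumulate,
# so broadcasting an elementwise multiply gives the identical result.
via_matmul = inv_freq_expanded @ position_ids_expanded
via_broadcast = inv_freq_expanded * position_ids_expanded

print(via_matmul.shape, via_broadcast.shape)   # torch.Size([2, 8, 5]) twice
print(torch.allclose(via_matmul, via_broadcast))  # True
```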
_defaults = { "text_kwargs": { diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 6162cb29559e..2ef1a1f30213 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -125,7 +125,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: @@ -630,7 +630,7 @@ class AltCLIPPreTrainedModel(PreTrainedModel): config: AltCLIPConfig base_model_prefix = "altclip" input_modalities = ("image", "text") - _no_split_modules = ["AltCLIPTextEmbeddings", "AltCLIPEncoderLayer", "AltCLIPVisionEmbeddings"] + _no_split_modules = ["AltRobertaEmbeddings", "AltRobertaLayer", "AltCLIPEncoderLayer", "AltCLIPVisionEmbeddings"] supports_gradient_checkpointing = True _supports_sdpa = True @@ -705,7 +705,7 @@ def __init__(self, config: AltCLIPVisionConfig): embed_dim = config.hidden_size self.embeddings = AltCLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = AltCLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.post_init() @@ -742,7 +742,7 @@ def forward( >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.pre_layernorm(hidden_states) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, diff --git a/src/transformers/models/altclip/modular_altclip.py b/src/transformers/models/altclip/modular_altclip.py index fe9be6cac92f..ed36ac6e2a48 100644 --- a/src/transformers/models/altclip/modular_altclip.py +++ b/src/transformers/models/altclip/modular_altclip.py @@ -226,6 +226,7 @@ class AltCLIPVisionEmbeddings(CLIPVisionEmbeddings): class AltCLIPPreTrainedModel(CLIPPreTrainedModel): + _no_split_modules = ["AltRobertaEmbeddings", "AltRobertaLayer", "AltCLIPEncoderLayer", "AltCLIPVisionEmbeddings"] _can_record_outputs = { "hidden_states": AltCLIPEncoderLayer, "attentions": AltCLIPAttention, diff --git a/src/transformers/models/apertus/modeling_apertus.py b/src/transformers/models/apertus/modeling_apertus.py index 7d14dd3d14c8..af1a03c7c900 100644 --- a/src/transformers/models/apertus/modeling_apertus.py +++ b/src/transformers/models/apertus/modeling_apertus.py @@ -134,7 +134,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git 
a/src/transformers/models/arcee/modeling_arcee.py b/src/transformers/models/arcee/modeling_arcee.py index 8d2d05bf2952..4e99339ca294 100644 --- a/src/transformers/models/arcee/modeling_arcee.py +++ b/src/transformers/models/arcee/modeling_arcee.py @@ -139,7 +139,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index e66b12438940..76d8459de528 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -673,7 +673,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -946,9 +946,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py b/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py index 246e37edd729..000d786560bb 100644 --- a/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py +++ b/src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py @@ -233,7 +233,7 @@ def merge_and_shard_weights(src_root: Path, dst_root: Path, processor: AudioFlam --dst_dir audio-flamingo-3-hf ``` -3) Convert and push directly to the Hub (requires `huggingface-cli login` or `HF_TOKEN`): +3) Convert and push directly to the Hub (requires `hf auth login` or `HF_TOKEN`): ``` python src/transformers/models/audioflamingo3/convert_audioflamingo3_to_hf.py \ diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index 5ef09f8eb443..9f9266aadde6 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -323,7 +323,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike[str], if kwargs.get("dtype") == "auto": _ = kwargs.pop("dtype") # to not overwrite the quantization_config if config has a quantization_config - if kwargs.get("quantization_config") is not None: + if "quantization_config" in kwargs: _ = kwargs.pop("quantization_config") 
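The switch from `kwargs.get("quantization_config") is not None` to `"quantization_config" in kwargs` here (and in the matching restore just below) matters when a caller passes `quantization_config=None` explicitly: the membership test still sees the key, so it can be popped before `AutoConfig.from_pretrained` runs and put back afterwards. A minimal standalone illustration of the difference:

```python
# Caller explicitly disables quantization by passing None.
kwargs = {"quantization_config": None}

# Value check: an explicit None looks the same as "not passed at all".
print(kwargs.get("quantization_config") is not None)  # False

# Membership check: the key is detected even when its value is None,
# so it can be stripped before the config is loaded and restored later.
print("quantization_config" in kwargs)  # True
```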
config, kwargs = AutoConfig.from_pretrained( @@ -340,7 +340,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike[str], kwargs["torch_dtype"] = "auto" if kwargs_orig.get("dtype", None) == "auto": kwargs["dtype"] = "auto" - if kwargs_orig.get("quantization_config", None) is not None: + if "quantization_config" in kwargs_orig: kwargs["quantization_config"] = kwargs_orig["quantization_config"] has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c624f49083d2..98447b6d1724 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -583,8 +583,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"] image_processor_auto_map = feature_extractor_auto_map.replace("FeatureExtractor", "ImageProcessor") - # If not in image processor config, try the model config - if image_processor_type is None and image_processor_auto_map is None: + # If not in image processor config, try the model config (override image_processor_auto_map if trust_remote_code is False) + if image_processor_type is None and (image_processor_auto_map is None or trust_remote_code is False): if not isinstance(config, PreTrainedConfig): config = AutoConfig.from_pretrained( pretrained_model_name_or_path, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a541f13499b7..0ddf436c42d2 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -704,6 +704,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("ministral", "MinistralForCausalLM"), ("ministral3", "Ministral3ForCausalLM"), ("mistral", "MistralForCausalLM"), + ("mistral4", "Mistral4ForCausalLM"), ("mixtral", "MixtralForCausalLM"), ("mllama", "MllamaForCausalLM"), ("modernbert-decoder", "ModernBertDecoderForCausalLM"), @@ -1217,6 +1218,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): [ ("cohere_asr", "CohereAsrForConditionalGeneration"), ("dia", "DiaForConditionalGeneration"), + ("glmasr", "GlmAsrForConditionalGeneration"), ("granite_speech", "GraniteSpeechForConditionalGeneration"), ("kyutai_speech_to_text", "KyutaiSpeechToTextForConditionalGeneration"), ("moonshine", "MoonshineForConditionalGeneration"), @@ -1329,7 +1331,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("qwen2_moe", "Qwen2MoeForSequenceClassification"), ("qwen3", "Qwen3ForSequenceClassification"), ("qwen3_5", "Qwen3_5ForSequenceClassification"), - ("qwen3_5_text", "Qwen3_5ForSequenceClassification"), + ("qwen3_5_text", "Qwen3_5TextForSequenceClassification"), ("qwen3_moe", "Qwen3MoeForSequenceClassification"), ("qwen3_next", "Qwen3NextForSequenceClassification"), ("reformer", "ReformerForSequenceClassification"), @@ -1688,6 +1690,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): # Model for Text-To-Waveform mapping ("bark", "BarkModel"), ("csm", "CsmForConditionalGeneration"), + ("dia", "DiaForConditionalGeneration"), ("fastspeech2_conformer_with_hifigan", "FastSpeech2ConformerWithHifiGan"), ("higgs_audio_v2", "HiggsAudioV2ForConditionalGeneration"), ("musicgen", "MusicgenForConditionalGeneration"), diff --git a/src/transformers/models/auto/tokenization_auto.py 
b/src/transformers/models/auto/tokenization_auto.py index bb0c13f7dbcc..169691e45a1c 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -172,6 +172,7 @@ ("led", "LEDTokenizer" if is_tokenizers_available() else None), ("lighton_ocr", "Qwen2TokenizerFast" if is_tokenizers_available() else None), ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None), + ("llama", "LlamaTokenizer" if is_tokenizers_available() else None), ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None), ("luke", "LukeTokenizer"), ("lxmert", "LxmertTokenizer" if is_tokenizers_available() else None), @@ -822,6 +823,11 @@ def from_pretrained( model_type = config_class_to_model_type(type(config).__name__) or getattr(config, "model_type", None) if model_type is not None: + if model_type == "voxtral" and not is_mistral_common_available(): + raise ImportError( + "The Voxtral tokenizer requires the 'mistral-common' package. " + "Use `pip install mistral-common` to install the package." + ) tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend) if tokenizer_class is not None: return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) diff --git a/src/transformers/models/auto/video_processing_auto.py b/src/transformers/models/auto/video_processing_auto.py index ae5b6f8b9ed3..49db6d3efc9a 100644 --- a/src/transformers/models/auto/video_processing_auto.py +++ b/src/transformers/models/auto/video_processing_auto.py @@ -80,6 +80,8 @@ def video_processor_class_from_name(class_name: str): for module_name, extractor in VIDEO_PROCESSOR_MAPPING_NAMES.items(): + if extractor is None: + continue if class_name == extractor: module_name = model_type_to_module_name(module_name) diff --git a/src/transformers/models/aya_vision/modeling_aya_vision.py b/src/transformers/models/aya_vision/modeling_aya_vision.py index 38f434d405a5..f74f1a85677a 100644 --- a/src/transformers/models/aya_vision/modeling_aya_vision.py +++ b/src/transformers/models/aya_vision/modeling_aya_vision.py @@ -227,9 +227,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 90188519aba7..83876f2fc46c 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -18,9 +18,11 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring +from ..got_ocr2.image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GotOcr2ImageProcessorKwargs _defaults = { "text_kwargs": { "padding_side": "left", diff --git 
a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 90129fc998b1..3782da89ce24 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -132,7 +132,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 53053f644539..a95c8e9752be 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -127,9 +127,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/beit/image_processing_pil_beit.py b/src/transformers/models/beit/image_processing_pil_beit.py index e3ccf12e909b..ff78dac96c40 100644 --- a/src/transformers/models/beit/image_processing_pil_beit.py +++ b/src/transformers/models/beit/image_processing_pil_beit.py @@ -120,10 +120,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - # Avoid using underflow conversion - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = (image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def _preprocess( diff --git a/src/transformers/models/bitnet/modeling_bitnet.py b/src/transformers/models/bitnet/modeling_bitnet.py index 14c1581b250f..6f6f969e9b6b 100644 --- a/src/transformers/models/bitnet/modeling_bitnet.py +++ b/src/transformers/models/bitnet/modeling_bitnet.py @@ -318,7 +318,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index c5c022d39066..12cc101356d7 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1240,7 +1240,7 @@ def get_placeholder_mask(self, input_ids: 
torch.LongTensor, inputs_embeds: torch else: special_image_mask = input_ids == self.config.image_token_id - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) return special_image_mask @can_return_tuple @@ -1686,7 +1686,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch else: special_image_mask = input_ids == self.config.image_token_id - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) return special_image_mask @can_return_tuple @@ -1913,7 +1913,7 @@ def generate( else: special_image_mask = input_ids == self.config.image_token_id - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) @@ -2054,13 +2054,7 @@ def forward( if use_image_text_matching_head: query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) - if self.config.image_token_index is not None: - input_ids = input_ids[:, self.config.num_query_tokens :] - else: - query_attention_mask = torch.ones( - query_tokens.size()[:-1], dtype=torch.long, device=query_tokens.device - ) - attention_mask = torch.cat([query_attention_mask, attention_mask], dim=1) + input_ids = input_ids[:, self.config.num_query_tokens :] query_embeds = self.embeddings( input_ids=input_ids, @@ -2092,9 +2086,8 @@ def forward( image_embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state image_embeds = image_embeds.to(dtype=self.vision_projection.weight.dtype) - if self.config.image_token_index is not None: - input_ids = input_ids[:, self.config.num_query_tokens :] - attention_mask = attention_mask[:, self.config.num_query_tokens :] + input_ids = input_ids[:, self.config.num_query_tokens :] + attention_mask = attention_mask[:, self.config.num_query_tokens :] query_embeds = self.embeddings( input_ids=input_ids, diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index e339854a6736..c8feeed2b822 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -77,8 +77,16 @@ def __call__( return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) max_length = output_kwargs["text_kwargs"].pop("max_length", None) if max_length is not None: - output_kwargs["text_kwargs"]["max_length"] = max_length - self.num_query_tokens - + num_query_tokens = self.num_query_tokens + if num_query_tokens is None: + logger.warning( + "Blip2Processor.num_query_tokens is None. Treating it as 0 for max_length calculations. " + "Consider updating the processor to set num_query_tokens explicitly." 
+ ) + num_query_tokens = 0 + adjusted_max_length = max_length - num_query_tokens + if adjusted_max_length > 0: + output_kwargs["text_kwargs"]["max_length"] = adjusted_max_length encoding = BatchFeature(tensor_type=return_tensors) if text is not None: if isinstance(text, str): diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 225289d8367e..d5d1b1f03a7e 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -820,7 +820,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index aa0ea7b4c4da..9424362e519c 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -17,9 +17,11 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...utils import auto_docstring +from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: BridgeTowerImageProcessorKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 9d10a8aeaef1..c47245a0ae2b 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -106,7 +106,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index af69779959e4..fe3243f01dc8 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -143,7 +143,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -911,9 +911,9 @@ def get_placeholder_mask( 
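The `get_placeholder_mask` hunks in this diff (Aria, AyaVision, and BLIP-2 above, Chameleon just below, the DeepseekVL variants further down, and others) drop the explicit `expand_as(inputs_embeds)` because `masked_scatter` accepts any mask that is broadcastable to the target, and they count tokens instead of boolean-indexing the embeddings so the check stays compile-friendly. A small self-contained sketch of that behavior in plain PyTorch, separate from the model code:

```python
import torch

hidden_size = 4
inputs_embeds = torch.zeros(1, 5, hidden_size)
special_image_mask = torch.tensor([[False, True, True, False, False]])
image_features = torch.arange(2 * hidden_size, dtype=torch.float).reshape(2, hidden_size)

# A (1, 5, 1) boolean mask broadcasts over the hidden dimension inside
# masked_scatter, so expand_as(inputs_embeds) is not required.
mask = special_image_mask.unsqueeze(-1)
scattered = inputs_embeds.masked_scatter(mask, image_features)

# Counting masked tokens avoids a data-dependent boolean index on the embeddings,
# which is what makes the check usable under torch.compile.
n_image_tokens = special_image_mask.sum()
assert n_image_tokens * inputs_embeds.shape[-1] == image_features.numel()
print(scattered[0, 1])  # tensor([0., 1., 2., 3.]) -- first image token row filled
```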
n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 3c2ddef2e7a4..99828afbda36 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -517,7 +517,12 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel): config: ChineseCLIPConfig base_model_prefix = "chinese_clip" input_modalities = ("image", "text") - _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPTextEmbeddings", "ChineseCLIPVisionAttention"] + _no_split_modules = [ + "ChineseCLIPVisionEmbeddings", + "ChineseCLIPTextEmbeddings", + "ChineseCLIPTextLayer", + "ChineseCLIPVisionAttention", + ] supports_gradient_checkpointing = True _supports_sdpa = True @@ -653,7 +658,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): embed_dim = config.hidden_size self.embeddings = ChineseCLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = ChineseCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.post_init() @@ -690,7 +695,7 @@ def forward( >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.pre_layernorm(hidden_states) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, diff --git a/src/transformers/models/chinese_clip/modular_chinese_clip.py b/src/transformers/models/chinese_clip/modular_chinese_clip.py index 280cb7bd54ae..bb6b05f9ac92 100644 --- a/src/transformers/models/chinese_clip/modular_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modular_chinese_clip.py @@ -197,7 +197,12 @@ class ChineseCLIPTextPooler(BertPooler): @auto_docstring class ChineseCLIPPreTrainedModel(CLIPPreTrainedModel): - _no_split_modules = ["ChineseCLIPVisionEmbeddings", "ChineseCLIPTextEmbeddings", "ChineseCLIPVisionAttention"] + _no_split_modules = [ + "ChineseCLIPVisionEmbeddings", + "ChineseCLIPTextEmbeddings", + "ChineseCLIPTextLayer", + "ChineseCLIPVisionAttention", + ] _can_record_outputs = { "hidden_states": ChineseCLIPVisionLayer, "attentions": ChineseCLIPVisionAttention, diff --git a/src/transformers/models/chmv2/image_processing_chmv2.py b/src/transformers/models/chmv2/image_processing_chmv2.py index 3bb82b2dea53..067ba5898734 100644 --- a/src/transformers/models/chmv2/image_processing_chmv2.py +++ b/src/transformers/models/chmv2/image_processing_chmv2.py @@ -182,9 +182,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = 
torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/chmv2/modular_chmv2.py b/src/transformers/models/chmv2/modular_chmv2.py index f61c6687a351..5f44654876c6 100644 --- a/src/transformers/models/chmv2/modular_chmv2.py +++ b/src/transformers/models/chmv2/modular_chmv2.py @@ -150,6 +150,17 @@ class CHMv2ImageProcessor(DPTImageProcessor): image_std = [0.213, 0.156, 0.143] valid_kwargs = CHMv2ImageProcessorKwargs + def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: + """Reduce label values by 1, replacing 0 with 255.""" + for idx in range(len(labels)): + label = labels[idx] + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 + labels[idx] = label + return labels + def post_process_depth_estimation( self, outputs: "DepthEstimatorOutput", diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 96c540a3424f..cf766d53a261 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -990,7 +990,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 2bca67e59a21..daeca0a502b1 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -481,15 +481,18 @@ def forward( **kwargs: Unpack[TransformersKwargs], ) -> BaseModelOutput: hidden_states = inputs_embeds + all_hidden_states = [hidden_states] if self.config.output_hidden_states else None for encoder_layer in self.layers: hidden_states = encoder_layer( hidden_states, attention_mask, **kwargs, ) + if all_hidden_states: + all_hidden_states.append(hidden_states) return BaseModelOutput( - last_hidden_state=hidden_states, + last_hidden_state=hidden_states, hidden_states=tuple(all_hidden_states) if all_hidden_states else None ) @@ -609,7 +612,7 @@ def __init__(self, config: CLIPVisionConfig): embed_dim = config.hidden_size self.embeddings = CLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = CLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.post_init() @@ -646,7 +649,7 @@ def forward( >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = 
self.pre_layernorm(hidden_states) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index cf17b44b00c2..a462bdc7ef40 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -708,7 +708,7 @@ def __init__(self, config: CLIPSegVisionConfig): embed_dim = config.hidden_size self.embeddings = CLIPSegVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = CLIPSegEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.post_init() @@ -745,7 +745,7 @@ def forward( >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.pre_layernorm(hidden_states) encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, diff --git a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py index 048412b383e7..c1915dfcea46 100644 --- a/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modeling_cohere2_vision.py @@ -188,9 +188,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index 7d76f1187733..bd65e67aa1f9 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -18,9 +18,11 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring +from .image_processing_cohere2_vision_fast import Cohere2VisionFastImageProcessorKwargs class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Cohere2VisionFastImageProcessorKwargs _defaults = { "text_kwargs": { "padding_side": "left", diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py index 1192be10606d..42f4bf3117da 100644 --- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py +++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py @@ -284,17 +284,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." 
) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech.to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/colqwen2/modeling_colqwen2.py b/src/transformers/models/colqwen2/modeling_colqwen2.py index 656ad6c758c5..795e6679bc7a 100644 --- a/src/transformers/models/colqwen2/modeling_colqwen2.py +++ b/src/transformers/models/colqwen2/modeling_colqwen2.py @@ -165,9 +165,7 @@ def forward( if pixel_values is not None: image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True).pooler_output - image_mask = ( - (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) - ) + image_mask = (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1) image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index aa7a3f48ca6e..8394607a08de 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -304,9 +304,7 @@ def forward( if pixel_values is not None: image_embeds = self.vlm.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True).pooler_output - image_mask = ( - (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) - ) + image_mask = (input_ids == self.config.vlm_config.image_token_id).unsqueeze(-1) image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 48af99206afe..89b737bd5009 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -29,9 +29,11 @@ if is_torch_available(): import torch +from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessorKwargs class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Qwen2VLImageProcessorKwargs _defaults = { "text_kwargs": { "padding": "longest", diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 539fe152f606..6dd8c22ff207 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -440,8 +440,13 @@ def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> Non kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is 
None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) diff --git a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py index 30740114d5f0..3f96def66064 100644 --- a/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_pil_conditional_detr.py @@ -443,13 +443,17 @@ def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> Non kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter - if size is not None: - from ...image_processing_utils import get_size_dict + from ...image_processing_utils import get_size_dict - kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) + kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index eb78dca8faf5..d6a38358f9d7 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -174,7 +174,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/cwm/modeling_cwm.py b/src/transformers/models/cwm/modeling_cwm.py index 3e0eb0504be0..afffc6e41449 100644 --- a/src/transformers/models/cwm/modeling_cwm.py +++ b/src/transformers/models/cwm/modeling_cwm.py @@ -99,7 +99,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/dac/convert_dac_checkpoint.py b/src/transformers/models/dac/convert_dac_checkpoint.py index 
b3360fc1706d..acfa4166414e 100644 --- a/src/transformers/models/dac/convert_dac_checkpoint.py +++ b/src/transformers/models/dac/convert_dac_checkpoint.py @@ -17,7 +17,6 @@ import numpy as np import torch -import torch.nn as nn from transformers import ( DacConfig, @@ -186,50 +185,21 @@ def recursively_load_weights(orig_dict, hf_model, model_name): logger.warning(f"Unused weights: {unused_weights}") -def apply_weight_norm(model): - weight_norm = nn.utils.weight_norm - - for layer in model.quantizer.quantizers: - weight_norm(layer.in_proj) - weight_norm(layer.out_proj) - - weight_norm(model.encoder.conv1) - weight_norm(model.encoder.conv2) - - for layer in model.encoder.block: - weight_norm(layer.conv1) - weight_norm(layer.res_unit1.conv1) - weight_norm(layer.res_unit1.conv2) - weight_norm(layer.res_unit2.conv1) - weight_norm(layer.res_unit2.conv2) - weight_norm(layer.res_unit3.conv1) - weight_norm(layer.res_unit3.conv2) - - weight_norm(model.decoder.conv1) - weight_norm(model.decoder.conv2) - - for layer in model.decoder.block: - weight_norm(layer.conv_t1) - weight_norm(layer.res_unit1.conv1) - weight_norm(layer.res_unit1.conv2) - weight_norm(layer.res_unit2.conv1) - weight_norm(layer.res_unit2.conv2) - weight_norm(layer.res_unit3.conv1) - weight_norm(layer.res_unit3.conv2) - - @torch.no_grad() def convert_checkpoint( model_name, checkpoint_path, pytorch_dump_folder_path, - sample_rate=16000, repo_id=None, + legacy_weight_norm=True, ): - model_dict = torch.load(checkpoint_path, "cpu", weights_only=True) + # NOTE: Models on Hub (https://huggingface.co/descript/models) did conversion on CPU. + # However, for equivalent weights after removing weight norm, conversion should be done on GPU. + # torch_device = "cuda" + torch_device = "cpu" + model_dict = torch.load(checkpoint_path, torch_device, weights_only=True) config = DacConfig() - metadata = model_dict["metadata"]["kwargs"] config.encoder_hidden_size = metadata["encoder_dim"] config.downsampling_ratios = metadata["encoder_rates"] @@ -239,18 +209,20 @@ def convert_checkpoint( config.decoder_hidden_size = metadata["decoder_dim"] config.upsampling_ratios = metadata["decoder_rates"] config.quantizer_dropout = float(metadata["quantizer_dropout"]) - config.sampling_rate = sample_rate + config.sampling_rate = int(metadata["sample_rate"]) config.hop_length = int(np.prod(config.downsampling_ratios)) - model = DacModel(config) + model = DacModel(config).to(torch_device) feature_extractor = DacFeatureExtractor() - feature_extractor.sampling_rate = sample_rate + feature_extractor.sampling_rate = config.sampling_rate + feature_extractor.hop_length = config.hop_length original_checkpoint = model_dict["state_dict"] - apply_weight_norm(model) + # original model uses old weight norm function + model.apply_weight_norm(legacy=legacy_weight_norm) recursively_load_weights(original_checkpoint, model, model_name) - model.remove_weight_norm() + model.remove_weight_norm(legacy=legacy_weight_norm) model.save_pretrained(pytorch_dump_folder_path) @@ -275,9 +247,14 @@ def convert_checkpoint( parser.add_argument( "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the Hugging Face hub." 
) - parser.add_argument("--sample_rate", default=None, type=str, help="Sample rate used by DacFeatureExtractor") + parser.add_argument( + "--legacy_weight_norm", + default=True, + type=bool, + help="Whether legacy weight normalization was used by original model.", + ) args = parser.parse_args() convert_checkpoint( - args.model, args.checkpoint_path, args.pytorch_dump_folder_path, args.sample_rate, args.push_to_hub + args.model, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.legacy_weight_norm ) diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py index 6ac46f78a4a6..d8f3f6ae5607 100644 --- a/src/transformers/models/dac/modeling_dac.py +++ b/src/transformers/models/dac/modeling_dac.py @@ -85,6 +85,9 @@ class DacDecoderOutput(ModelOutput): class Snake1d(nn.Module): """ A 1-dimensional Snake activation function module. + + Original version from DAC used JIT compilation: https://github.com/descriptinc/descript-audio-codec/blob/main/dac/nn/layers.py#L18-L33 + This leads to slight differences in output. """ def __init__(self, hidden_dim): @@ -490,9 +493,10 @@ def _init_weights(self, module): elif isinstance(module, nn.Embedding): init.normal_(module.weight, mean=0.0, std=0.02) - def apply_weight_norm(self): + def apply_weight_norm(self, legacy=True): + # original version of DAC uses legacy weight norm weight_norm = nn.utils.weight_norm - if hasattr(nn.utils.parametrizations, "weight_norm"): + if hasattr(nn.utils.parametrizations, "weight_norm") and not legacy: weight_norm = nn.utils.parametrizations.weight_norm for layer in self.quantizer.quantizers: @@ -523,34 +527,38 @@ def apply_weight_norm(self): weight_norm(layer.res_unit3.conv1) weight_norm(layer.res_unit3.conv2) - def remove_weight_norm(self): + def remove_weight_norm(self, legacy=True): + remove_weight_norm = nn.utils.remove_weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm") and not legacy: + remove_weight_norm = torch.nn.utils.parametrize.remove_parametrizations + for layer in self.quantizer.quantizers: - nn.utils.remove_weight_norm(layer.in_proj) - nn.utils.remove_weight_norm(layer.out_proj) + remove_weight_norm(layer.in_proj, "weight") + remove_weight_norm(layer.out_proj, "weight") - nn.utils.remove_weight_norm(self.encoder.conv1) - nn.utils.remove_weight_norm(self.encoder.conv2) + remove_weight_norm(self.encoder.conv1, "weight") + remove_weight_norm(self.encoder.conv2, "weight") for layer in self.encoder.block: - nn.utils.remove_weight_norm(layer.conv1) - nn.utils.remove_weight_norm(layer.res_unit1.conv1) - nn.utils.remove_weight_norm(layer.res_unit1.conv2) - nn.utils.remove_weight_norm(layer.res_unit2.conv1) - nn.utils.remove_weight_norm(layer.res_unit2.conv2) - nn.utils.remove_weight_norm(layer.res_unit3.conv1) - nn.utils.remove_weight_norm(layer.res_unit3.conv2) + remove_weight_norm(layer.conv1, "weight") + remove_weight_norm(layer.res_unit1.conv1, "weight") + remove_weight_norm(layer.res_unit1.conv2, "weight") + remove_weight_norm(layer.res_unit2.conv1, "weight") + remove_weight_norm(layer.res_unit2.conv2, "weight") + remove_weight_norm(layer.res_unit3.conv1, "weight") + remove_weight_norm(layer.res_unit3.conv2, "weight") - nn.utils.remove_weight_norm(self.decoder.conv1) - nn.utils.remove_weight_norm(self.decoder.conv2) + remove_weight_norm(self.decoder.conv1, "weight") + remove_weight_norm(self.decoder.conv2, "weight") for layer in self.decoder.block: - nn.utils.remove_weight_norm(layer.conv_t1) - 
nn.utils.remove_weight_norm(layer.res_unit1.conv1) - nn.utils.remove_weight_norm(layer.res_unit1.conv2) - nn.utils.remove_weight_norm(layer.res_unit2.conv1) - nn.utils.remove_weight_norm(layer.res_unit2.conv2) - nn.utils.remove_weight_norm(layer.res_unit3.conv1) - nn.utils.remove_weight_norm(layer.res_unit3.conv2) + remove_weight_norm(layer.conv_t1, "weight") + remove_weight_norm(layer.res_unit1.conv1, "weight") + remove_weight_norm(layer.res_unit1.conv2, "weight") + remove_weight_norm(layer.res_unit2.conv1, "weight") + remove_weight_norm(layer.res_unit2.conv2, "weight") + remove_weight_norm(layer.res_unit3.conv1, "weight") + remove_weight_norm(layer.res_unit3.conv2, "weight") @auto_docstring( diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index bb3af4ecf25a..dbdaec7efdbc 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -45,6 +45,7 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, is_peft_available +from ...utils.generic import _conv_out_length from .configuration_data2vec_audio import Data2VecAudioConfig @@ -510,11 +511,6 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor | int add_adapter = self.config.add_adapter if add_adapter is None else add_adapter - def _conv_out_length(input_length, kernel_size, stride): - # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html - return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 - for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) @@ -1220,11 +1216,6 @@ def _get_tdnn_output_lengths(self, input_lengths: torch.LongTensor | int): Computes the output length of the TDNN layers """ - def _conv_out_length(input_length, kernel_size, stride): - # 1D convolutional layer output length formula taken - # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html - return (input_length - kernel_size) // stride + 1 - for kernel_size in self.config.tdnn_kernel: input_lengths = _conv_out_length(input_lengths, kernel_size, 1) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 512431cb3b0a..47f9866e9f4f 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -105,7 +105,7 @@ def forward( if token_type_ids is None: if hasattr(self, "token_type_ids"): # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) - buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = self.token_type_ids.to(position_ids.device).expand(position_ids.shape[0], -1) buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 58735fb55c0b..a820e61e1113 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -98,7 +98,7 @@ def 
forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -594,7 +594,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -602,7 +602,9 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) @@ -619,8 +621,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py index fe3acd9aeddd..b59f7dfcc75a 100644 --- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -112,7 +112,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -384,7 +384,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False) else: self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias) - self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) + self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank, eps=config.rms_norm_eps) self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) self.kv_a_proj_with_mqa = nn.Linear( @@ -392,7 +392,7 @@ def __init__(self, config: DeepseekV3Config, 
layer_idx: int): self.kv_lora_rank + self.qk_rope_head_dim, bias=config.attention_bias, ) - self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank) + self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = nn.Linear( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), diff --git a/src/transformers/models/deepseek_v3/modular_deepseek_v3.py b/src/transformers/models/deepseek_v3/modular_deepseek_v3.py index 2bf7d347e85d..38041e0b9707 100644 --- a/src/transformers/models/deepseek_v3/modular_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modular_deepseek_v3.py @@ -189,7 +189,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False) else: self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias) - self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank) + self.q_a_layernorm = DeepseekV3RMSNorm(config.q_lora_rank, eps=config.rms_norm_eps) self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) self.kv_a_proj_with_mqa = nn.Linear( @@ -197,7 +197,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): self.kv_lora_rank + self.qk_rope_head_dim, bias=config.attention_bias, ) - self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank) + self.kv_a_layernorm = DeepseekV3RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = nn.Linear( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), diff --git a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py index c58f56ddfac0..ca2dbdb1ea8b 100644 --- a/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modeling_deepseek_vl.py @@ -180,9 +180,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index 7057ff152a67..be55db718b82 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -24,9 +24,11 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring +from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: DeepseekVLImageProcessorKwargs _defaults = { "text_kwargs": {"padding": False}, "common_kwargs": {"return_tensors": "pt"}, diff --git a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py index eb85a8d02a76..83e0c656e244 100644 --- 
a/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py @@ -331,9 +331,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask @@ -373,7 +373,7 @@ def forward( else: image_attention_mask = input_ids == self.config.image_token_id - image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_attention_mask = image_attention_mask.unsqueeze(-1).to(inputs_embeds.device) image_embeds = self.get_image_features(pixel_values, high_res_pixel_values, return_dict=True).pooler_output image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 99d24c163562..d1567dda59d9 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -332,7 +332,7 @@ def forward( else: image_attention_mask = input_ids == self.config.image_token_id - image_attention_mask = image_attention_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + image_attention_mask = image_attention_mask.unsqueeze(-1).to(inputs_embeds.device) image_embeds = self.get_image_features(pixel_values, high_res_pixel_values, return_dict=True).pooler_output image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1]) image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index 7948b954b6d7..9c1f4f8c012d 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -23,9 +23,11 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring +from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: DeepseekVLHybridImageProcessorKwargs _defaults = { "text_kwargs": {"padding": False}, "common_kwargs": {"return_tensors": "pt"}, diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 1bc4775255d6..246828a95756 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -284,8 +284,13 @@ def __init__(self, **kwargs: 
Unpack[DeformableDetrImageProcessorKwargs]) -> None kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) diff --git a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py index 9c7ccc213910..68ec02518d8b 100644 --- a/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_pil_deformable_detr.py @@ -278,13 +278,17 @@ def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter - if size is not None: - from ...image_processing_utils import get_size_dict + from ...image_processing_utils import get_size_dict - kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) + kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index e5cfa7ce14fb..ccfc0ec10216 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -438,8 +438,13 @@ def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) diff --git a/src/transformers/models/detr/image_processing_pil_detr.py b/src/transformers/models/detr/image_processing_pil_detr.py index 14c5769549d8..e5995c70d157 100644 --- 
a/src/transformers/models/detr/image_processing_pil_detr.py +++ b/src/transformers/models/detr/image_processing_pil_detr.py @@ -442,13 +442,17 @@ def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: kwargs.setdefault("do_pad", kwargs.pop("pad_and_return_pixel_mask", self.do_pad)) size = kwargs.pop("size", None) - max_size = None if size is None else kwargs.pop("max_size", 1333) - size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + max_size = kwargs.pop("max_size", None) + + if size is None: + size = {"shortest_edge": 800, "longest_edge": max_size if max_size is not None else 1333} + elif isinstance(size, dict) and max_size is not None and "longest_edge" not in size: + size = {**size, "longest_edge": max_size} + # Convert size dict for backwards compat with max_size parameter - if size is not None: - from ...image_processing_utils import get_size_dict + from ...image_processing_utils import get_size_dict - kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) + kwargs["size"] = get_size_dict(size, max_size=max_size, default_to_square=False) # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py index 629dfd4cdb35..cc649f4459b4 100644 --- a/src/transformers/models/dia/modeling_dia.py +++ b/src/transformers/models/dia/modeling_dia.py @@ -193,7 +193,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index d80ccd572dc3..6a6703bcc50f 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -126,7 +126,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/doge/modeling_doge.py b/src/transformers/models/doge/modeling_doge.py index 4aad59b52a9a..de518cba9287 100644 --- a/src/transformers/models/doge/modeling_doge.py +++ b/src/transformers/models/doge/modeling_doge.py @@ -128,7 +128,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = 
emb.sin() * self.attention_scaling @@ -266,6 +266,7 @@ def __init__(self, config: DogeConfig, layer_idx: int | None = None): self.scaling = self.head_dim**-0.5 self.attention_dropout = config.attention_dropout self.keep_window_size = config.keep_window_size + self.is_causal = True self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias @@ -477,7 +478,7 @@ def forward( # sequence transformation residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - hidden_states, self_attn_weights = self.self_attn( + hidden_states, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, @@ -493,6 +494,8 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) + if isinstance(hidden_states, tuple): + hidden_states, _ = hidden_states hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) hidden_states = self.post_attention_residual * residual + hidden_states @@ -524,6 +527,9 @@ def _init_weights(self, module): if isinstance(module, DogeAttention): if hasattr(module, "A"): init.zeros_(module.A) + elif isinstance(module, DogeCDMoE): + if hasattr(module, "router_gate"): + init.zeros_(module.router_gate.weight) elif isinstance(module, DogeDecoderLayer): if hasattr(module, "input_residual"): init.ones_(module.input_residual) diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 8b78126c0a00..840390ec51b8 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -204,6 +204,7 @@ def __init__(self, config: DogeConfig, layer_idx: int | None = None): self.scaling = self.head_dim**-0.5 self.attention_dropout = config.attention_dropout self.keep_window_size = config.keep_window_size + self.is_causal = True self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias @@ -403,7 +404,7 @@ def forward( # sequence transformation residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - hidden_states, self_attn_weights = self.self_attn( + hidden_states, _ = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, attention_mask=attention_mask, @@ -419,6 +420,8 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) + if isinstance(hidden_states, tuple): + hidden_states, _ = hidden_states hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training) hidden_states = self.post_attention_residual * residual + hidden_states @@ -441,6 +444,9 @@ def _init_weights(self, module): if isinstance(module, DogeAttention): if hasattr(module, "A"): init.zeros_(module.A) + elif isinstance(module, DogeCDMoE): + if hasattr(module, "router_gate"): + init.zeros_(module.router_gate.weight) elif isinstance(module, DogeDecoderLayer): if hasattr(module, "input_residual"): init.ones_(module.input_residual) diff --git a/src/transformers/models/dots1/modeling_dots1.py b/src/transformers/models/dots1/modeling_dots1.py index 95b21258ffd5..7cfde8d1957c 100644 --- a/src/transformers/models/dots1/modeling_dots1.py +++ b/src/transformers/models/dots1/modeling_dots1.py @@ -125,7 +125,7 @@ def forward(self, x, position_ids): device_type = x.device.type if 
isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 6d157f6385c0..7969cead3f21 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -192,9 +192,10 @@ def reduce_label(self, labels: list["torch.Tensor"]) -> list["torch.Tensor"]: """Reduce label values by 1, replacing 0 with 255.""" for idx in range(len(labels)): label = labels[idx] - label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype, device=label.device), label) - label = label - 1 - label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype, device=label.device), label) + ignore_mask = (label == 0) | (label == 255) + label = label.clone() + label[ignore_mask] = 255 + label[~ignore_mask] = label[~ignore_mask] - 1 labels[idx] = label return labels diff --git a/src/transformers/models/dpt/image_processing_pil_dpt.py b/src/transformers/models/dpt/image_processing_pil_dpt.py index 6f770cac4e5f..07e711769829 100644 --- a/src/transformers/models/dpt/image_processing_pil_dpt.py +++ b/src/transformers/models/dpt/image_processing_pil_dpt.py @@ -180,9 +180,10 @@ def _preprocess_image_like_inputs( def reduce_label(self, image: np.ndarray) -> np.ndarray: """Reduce label values by 1, replacing 0 with 255.""" - image[image == 0] = 255 - image = image - 1 - image[image == 254] = 255 + image = image.copy() + ignore_mask = (image == 0) | (image == 255) + image[ignore_mask] = 255 + image[~ignore_mask] = image[~ignore_mask] - 1 return image def resize( diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 2481decd7aeb..8fa4fd9d9188 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1188,7 +1188,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -1447,9 +1447,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py 
index 598687892727..e37ce1eb337f 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1016,9 +1016,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index f634f89ab89f..9718ec588100 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -35,11 +35,16 @@ logger = logging.get_logger(__name__) -DEPRECATION_WARNING = ( +# Warning about deprecated practice of passing decoder_input_ids when labels are provided +DEPRECATED_DECODER_INPUT_IDS_WARNING = ( + "The decoder_input_ids are created based on the labels, no need to pass them yourself anymore." +) + +# Warning about v4.12.0 loss computation change - always shown when training with labels +V4_12_LOSS_COMPUTATION_WARNING = ( "Version v4.12.0 introduces a better way to train encoder-decoder models by computing the loss inside the" " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if" - " fine-tuning a model trained with versions anterior to 4.12.0. The decoder_input_ids are now created based on the" - " labels, no need to pass them yourself anymore." + " fine-tuning a model trained with versions anterior to 4.12.0." ) @@ -423,6 +428,9 @@ def forward( ) if decoder_attention_mask is None: decoder_attention_mask = (decoder_input_ids != self.config.pad_token_id).to(decoder_input_ids.dtype) + elif (labels is not None) and (decoder_input_ids is not None): + # User provided both labels and decoder_input_ids - this is the deprecated path + warnings.warn(DEPRECATED_DECODER_INPUT_IDS_WARNING, FutureWarning) # Decode decoder_outputs = self.decoder( @@ -440,7 +448,8 @@ def forward( # Compute loss independent from decoder (as some shift the logits inside them) loss = None if labels is not None: - warnings.warn(DEPRECATION_WARNING, FutureWarning) + # Always warn about v4.12.0 loss computation change + warnings.warn(V4_12_LOSS_COMPUTATION_WARNING, FutureWarning) logits = decoder_outputs.logits loss_fct = CrossEntropyLoss() loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.view(-1)) diff --git a/src/transformers/models/eomt/modeling_eomt.py b/src/transformers/models/eomt/modeling_eomt.py index 589b023d4db8..554b37da4b03 100644 --- a/src/transformers/models/eomt/modeling_eomt.py +++ b/src/transformers/models/eomt/modeling_eomt.py @@ -1109,6 +1109,14 @@ def forward( list of tuples indicating the image index and start and end positions of patches for semantic segmentation. 
""" + if mask_labels is not None: + target_device = pixel_values.device + mask_labels = [mask.to(target_device) for mask in mask_labels] + + if class_labels is not None: + target_device = pixel_values.device + class_labels = [label.to(target_device) for label in class_labels] + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () attention_mask = None diff --git a/src/transformers/models/eomt/modular_eomt.py b/src/transformers/models/eomt/modular_eomt.py index e4dafa024861..9cc2d228e24e 100644 --- a/src/transformers/models/eomt/modular_eomt.py +++ b/src/transformers/models/eomt/modular_eomt.py @@ -455,6 +455,14 @@ def forward( list of tuples indicating the image index and start and end positions of patches for semantic segmentation. """ + if mask_labels is not None: + target_device = pixel_values.device + mask_labels = [mask.to(target_device) for mask in mask_labels] + + if class_labels is not None: + target_device = pixel_values.device + class_labels = [label.to(target_device) for label in class_labels] + masks_queries_logits_per_layer, class_queries_logits_per_layer = (), () attention_mask = None diff --git a/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py index 9c106a90010d..bfa8c79b6bab 100644 --- a/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py @@ -605,7 +605,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -613,13 +613,17 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) else: - batch_size, sequence_length = attention_mask.shape - num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + num_hidden_layers = len(gate_logits) + batch_size = attention_mask.shape[0] + sequence_length = gate_logits[0].shape[0] // batch_size + attention_mask = attention_mask[:, -sequence_length:] # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask expert_attention_mask = ( @@ -630,8 +634,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git 
a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py index f3d7bc590f5d..0a26d9796c04 100644 --- a/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py @@ -600,7 +600,7 @@ def forward( if is_flash_attention_requested(self.config): # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, @@ -1334,18 +1334,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.video_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask @@ -1517,7 +1517,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -1525,7 +1525,9 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) @@ -1542,8 +1544,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git 
a/src/transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py b/src/transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py index 8413907ef3c2..5eab0158452f 100644 --- a/src/transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +++ b/src/transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py @@ -22,9 +22,11 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...video_utils import VideoInput +from .image_processing_ernie4_5_vl_moe import Ernie4_5_VLMoeImageProcessorKwargs class Ernie4_5_VLMoeProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Ernie4_5_VLMoeImageProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 74fd137882d2..4efa653779ea 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -1086,7 +1086,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index f29cab8dd8ea..f53f2ce2d05d 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -98,15 +98,26 @@ class Exaone4Config(PreTrainedConfig): layer_types: list[str] | None = None def __post_init__(self, **kwargs): - if self.sliding_window is None: - self.sliding_window_pattern = 0 if self.layer_types is None: - self.layer_types = [ - "sliding_attention" - if ((i + 1) % (self.sliding_window_pattern) != 0 and i < self.num_hidden_layers) - else "full_attention" - for i in range(self.num_hidden_layers) - ] + if self.sliding_window in (None, 0): + self.layer_types = ["full_attention"] * self.num_hidden_layers + elif isinstance(self.sliding_window_pattern, str) and self.sliding_window_pattern: + layer_pattern = [ + "sliding_attention" if layer_type.upper() == "L" else "full_attention" + for layer_type in self.sliding_window_pattern + ] + self.layer_types = [ + layer_pattern[i % len(layer_pattern)] for i in range(self.num_hidden_layers - 1) + ] + ["full_attention"] + else: + repeat_period = self.sliding_window_pattern if isinstance(self.sliding_window_pattern, int) else 1 + repeat_period = max(repeat_period, 1) + self.layer_types = [ + "sliding_attention" + if ((i + 1) % repeat_period != 0 and i < self.num_hidden_layers - 1) + else "full_attention" + for i in range(self.num_hidden_layers) + ] super().__post_init__(**kwargs) diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index fab10b9b6937..2009ee162f7d 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -124,7 +124,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and 
x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index c6d9202170a0..89ccb28a90bd 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -127,15 +127,26 @@ class Exaone4Config(PreTrainedConfig): layer_types: list[str] | None = None def __post_init__(self, **kwargs): - if self.sliding_window is None: - self.sliding_window_pattern = 0 if self.layer_types is None: - self.layer_types = [ - "sliding_attention" - if ((i + 1) % (self.sliding_window_pattern) != 0 and i < self.num_hidden_layers) - else "full_attention" - for i in range(self.num_hidden_layers) - ] + if self.sliding_window in (None, 0): + self.layer_types = ["full_attention"] * self.num_hidden_layers + elif isinstance(self.sliding_window_pattern, str) and self.sliding_window_pattern: + layer_pattern = [ + "sliding_attention" if layer_type.upper() == "L" else "full_attention" + for layer_type in self.sliding_window_pattern + ] + self.layer_types = [ + layer_pattern[i % len(layer_pattern)] for i in range(self.num_hidden_layers - 1) + ] + ["full_attention"] + else: + repeat_period = self.sliding_window_pattern if isinstance(self.sliding_window_pattern, int) else 1 + repeat_period = max(repeat_period, 1) + self.layer_types = [ + "sliding_attention" + if ((i + 1) % repeat_period != 0 and i < self.num_hidden_layers - 1) + else "full_attention" + for i in range(self.num_hidden_layers) + ] super().__post_init__(**kwargs) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 016b3209b6b1..89141439e668 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -157,7 +157,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -280,15 +280,15 @@ def _split_heads(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Ten return query, key, value elif not self.multi_query: batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + fused_qkv = fused_qkv.view(batch_size, seq_length, -1, 3, self.head_dim) return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] else: batch_size, seq_length, three_times_hidden_size = fused_qkv.shape - fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads + 2, self.head_dim) + fused_qkv = fused_qkv.view(batch_size, seq_length, -1, self.head_dim) return fused_qkv[..., :-2, :], fused_qkv[..., [-2], :], fused_qkv[..., [-1], :] # Copied from 
transformers.models.bloom.modeling_bloom.BloomAttention._merge_heads - def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: + def _merge_heads(self, x: torch.Tensor, tp_aware_num_heads: int) -> torch.Tensor: """ Merge heads together over the last dimension @@ -301,17 +301,17 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: # What we want to achieve is: # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim batch_size_and_num_heads, seq_length, _ = x.shape - batch_size = batch_size_and_num_heads // self.num_heads + batch_size = batch_size_and_num_heads // tp_aware_num_heads # First view to decompose the batch size # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim - x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) + x = x.view(batch_size, tp_aware_num_heads, seq_length, self.head_dim) # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim x = x.permute(0, 2, 1, 3) # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim - return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) + return x.reshape(batch_size, seq_length, tp_aware_num_heads * self.head_dim) def forward( self, @@ -326,15 +326,20 @@ def forward( **kwargs, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads # 3 x [batch_size, seq_length, num_heads, head_dim] (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) batch_size, query_length, _, _ = query_layer.shape - query_layer = query_layer.transpose(1, 2).reshape(batch_size, self.num_heads, query_length, self.head_dim) - key_layer = key_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) - value_layer = value_layer.transpose(1, 2).reshape(batch_size, num_kv_heads, query_length, self.head_dim) + tp_aware_num_heads = query_layer.shape[2] + tp_aware_key_heads = key_layer.shape[2] + tp_aware_value_heads = value_layer.shape[2] + + query_layer = query_layer.transpose(1, 2).reshape(batch_size, tp_aware_num_heads, query_length, self.head_dim) + key_layer = key_layer.transpose(1, 2).reshape(batch_size, tp_aware_key_heads, query_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape( + batch_size, tp_aware_value_heads, query_length, self.head_dim + ) if alibi is None: cos, sin = position_embeddings @@ -369,9 +374,9 @@ def forward( # It is unclear why dropout is not applied here (while it is with alibi). 
attn_output = attention_scores @ value_layer - attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim) + attn_output = attn_output.view(batch_size, tp_aware_num_heads, query_length, self.head_dim) attn_output = attn_output.permute(0, 2, 1, 3) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + attn_output = attn_output.reshape(batch_size, query_length, tp_aware_num_heads * self.head_dim) attn_output = self.dense(attn_output) @@ -392,14 +397,14 @@ def forward( ) attention_probs = None attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) + attn_output = attn_output.reshape(batch_size, query_length, tp_aware_num_heads * self.head_dim) attn_output = self.dense(attn_output) else: matmul_result = query_layer @ key_layer.transpose(-1, -2) # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length) + attention_scores = matmul_result.view(batch_size, tp_aware_num_heads, query_length, kv_length) # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] input_dtype = attention_scores.dtype @@ -407,20 +412,22 @@ def forward( if input_dtype == torch.float16 or input_dtype == torch.bfloat16: attention_scores = attention_scores.to(torch.float32) - attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1) + attention_logits = attention_scores + alibi.view(batch_size, tp_aware_num_heads, 1, -1) attention_logits *= self.inv_norm_factor attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype) # [batch_size, num_heads, q_length, kv_length] attention_probs = self.attention_dropout(attention_probs) # change view [batch_size, num_heads, q_length, kv_length] - attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length) + attention_probs_reshaped = attention_probs.view( + batch_size, tp_aware_num_heads, query_length, kv_length + ) # matmul: [batch_size * num_heads, q_length, head_dim] attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1) # change view [batch_size, q_length, num_heads * head_dim] - attn_output = self._merge_heads(attn_output) + attn_output = self._merge_heads(attn_output, tp_aware_num_heads) attn_output = self.dense(attn_output) @@ -771,7 +778,7 @@ def forward( attention_mask=attention_mask, past_key_values=past_key_values, # Force mask creation for alibi - and_mask_function=lambda *args: torch.tensor(True, dtype=torch.bool), + and_mask_function=(lambda *args: torch.tensor(True, dtype=torch.bool)) if self.use_alibi else None, ) if alibi is not None and causal_mask is not None and causal_mask.ndim == 4: min_dtype = torch.finfo(inputs_embeds.dtype).min diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index 37b5da9df4b3..9e281c1b1c0b 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -110,7 +110,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + 
freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/fast_vlm/modeling_fast_vlm.py b/src/transformers/models/fast_vlm/modeling_fast_vlm.py index 85c2eeb82b64..53ff29d5b558 100644 --- a/src/transformers/models/fast_vlm/modeling_fast_vlm.py +++ b/src/transformers/models/fast_vlm/modeling_fast_vlm.py @@ -162,9 +162,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/flex_olmo/modeling_flex_olmo.py b/src/transformers/models/flex_olmo/modeling_flex_olmo.py index 100e6fa35554..5967e27b691a 100644 --- a/src/transformers/models/flex_olmo/modeling_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modeling_flex_olmo.py @@ -548,7 +548,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -556,7 +556,9 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) @@ -573,8 +575,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py index fd941b85ce66..6abd07dddca2 100644 --- a/src/transformers/models/florence2/modeling_florence2.py +++ b/src/transformers/models/florence2/modeling_florence2.py @@ -716,9 +716,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = 
special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index df57519032b9..e38b4a099ea8 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -141,9 +141,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index 76287ae3a5ea..b86a4b6cb4a8 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -30,6 +30,7 @@ from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, is_torch_available, logging, requires_backends from ...utils.import_utils import requires +from .image_processing_fuyu import FuyuImagesKwargs if is_torch_available(): @@ -56,6 +57,7 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: FuyuImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index c6c5a55b8790..13a6451e112f 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -154,7 +154,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 20673571b2d2..d7f347cd3a01 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -139,7 +139,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = 
torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 2edd9ef5f101..50a176e4b287 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -170,7 +170,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index 38f50e95bb6d..247c8788cbe8 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -42,7 +42,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check +from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check from ...utils.deprecation import deprecate_kwarg from ...utils.generic import maybe_autocast, merge_with_config_defaults from ...utils.output_capturing import capture_outputs @@ -50,9 +50,6 @@ from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig -logger = logging.get_logger(__name__) - - @dataclass @auto_docstring( custom_intro=""" @@ -824,9 +821,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask @@ -1126,24 +1123,17 @@ def create_masks_for_generate( ) -class Gemma3ForSequenceClassification(Gemma3PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Gemma3Model(config) - self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.get_input_embeddings() +@auto_docstring( + custom_intro=""" +Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig. 
+It uses the generic sequence classification implementation for efficiency and consistency.""" +) +class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): + config: Gemma3TextConfig + input_modalities = ("text",) - def set_input_embeddings(self, value): - self.model.set_input_embeddings(value) - @can_return_tuple - @auto_docstring +class Gemma3ForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): def forward( self, input_ids: torch.LongTensor | None = None, @@ -1151,78 +1141,22 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, - inputs_embeds: torch.FloatTensor | None = None, token_type_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> SequenceClassifierOutputWithPast: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - - transformer_outputs = self.model( - input_ids, + return super().forward( + input_ids=input_ids, attention_mask=attention_mask, - pixel_values=pixel_values, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, + pixel_values=pixel_values, token_type_ids=token_type_ids, - use_cache=use_cache, - return_dict=True, + labels=labels, **kwargs, ) - hidden_states = transformer_outputs.last_hidden_state - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.text_config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.text_config.pad_token_id is None: - last_non_pad_token = -1 - elif input_ids is not None: - # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id - non_pad_mask = (input_ids != self.config.text_config.pad_token_id).to(logits.device, torch.int32) - token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) - last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) - else: - last_non_pad_token = -1 - logger.warning_once( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " - "unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): - """ - Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig. 
- It uses the generic sequence classification implementation for efficiency and consistency. - """ - - config: Gemma3TextConfig - input_modalities = ("text",) __all__ = [ diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 9de1d8172513..6d965e9f6890 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -889,24 +889,17 @@ def prepare_inputs_for_generation( return model_inputs -class Gemma3ForSequenceClassification(Gemma3PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Gemma3Model(config) - self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.get_input_embeddings() +@auto_docstring( + custom_intro=""" +Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig. +It uses the generic sequence classification implementation for efficiency and consistency.""" +) +class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): + config: Gemma3TextConfig + input_modalities = ("text",) - def set_input_embeddings(self, value): - self.model.set_input_embeddings(value) - @can_return_tuple - @auto_docstring +class Gemma3ForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): def forward( self, input_ids: torch.LongTensor | None = None, @@ -914,78 +907,22 @@ def forward( attention_mask: torch.Tensor | None = None, position_ids: torch.LongTensor | None = None, past_key_values: Cache | None = None, - inputs_embeds: torch.FloatTensor | None = None, token_type_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, labels: torch.LongTensor | None = None, - use_cache: bool | None = None, **kwargs: Unpack[TransformersKwargs], ) -> SequenceClassifierOutputWithPast: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - - transformer_outputs = self.model( - input_ids, + return super().forward( + input_ids=input_ids, attention_mask=attention_mask, - pixel_values=pixel_values, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, + pixel_values=pixel_values, token_type_ids=token_type_ids, - use_cache=use_cache, - return_dict=True, + labels=labels, **kwargs, ) - hidden_states = transformer_outputs.last_hidden_state - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.text_config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.text_config.pad_token_id is None: - last_non_pad_token = -1 - elif input_ids is not None: - # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id - non_pad_mask = (input_ids != self.config.text_config.pad_token_id).to(logits.device, torch.int32) - token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) - last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) - else: - last_non_pad_token = -1 - logger.warning_once( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " - "unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel): - """ - Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig. - It uses the generic sequence classification implementation for efficiency and consistency. 
- """ - - config: Gemma3TextConfig - input_modalities = ("text",) __all__ = [ diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index 048fe1adfa66..f24b23db4f55 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -19,9 +19,11 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, to_py_obj +from .image_processing_gemma3 import Gemma3ImageProcessorKwargs class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Gemma3ImageProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index e61c5f0038e7..60be54dacc18 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -394,7 +394,7 @@ def to_dict(self) -> dict[str, Any]: @strict class Gemma3nConfig(PreTrainedConfig): r""" - audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + audio_soft_tokens_per_audio (`int`, *optional*, defaults to 188): The number of soft tokens per audio clip. vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): The number of soft tokens per image. @@ -441,7 +441,7 @@ class Gemma3nConfig(PreTrainedConfig): text_config: Gemma3nTextConfig | dict[str, Any] | None = None vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None - audio_soft_tokens_per_image: int | None = 188 + audio_soft_tokens_per_audio: int | None = 188 vision_soft_tokens_per_image: int | None = 256 boi_token_id: int | None = 255_999 eoi_token_id: int | None = 262_144 diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 8d1c5348d378..039c8c4e84c9 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -1485,7 +1485,7 @@ def forward( Returns: audio_encodings: a torch.Tensor of shape - `[batch_size, self.config.audio_soft_tokens_per_image, + `[batch_size, self.config.audio_soft_tokens_per_audio, self.config.audio_config.hidden_size]` audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. 
""" @@ -2040,18 +2040,18 @@ def get_placeholder_mask( special_audio_mask = input_ids == self.config.audio_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0] * image_features.shape[1]}", ) n_audio_tokens = special_audio_mask.sum() - special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_audio_mask = special_audio_mask.unsqueeze(-1).to(inputs_embeds.device) if audio_features is not None: torch_compilable_check( - inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + n_audio_tokens * inputs_embeds.shape[-1] == audio_features.numel(), f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {audio_features.shape[0] * audio_features.shape[1]}", ) @@ -2124,7 +2124,7 @@ def forward( vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device) vision_embeds = self.embed_vision(input_ids=vision_input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds) + expanded_vision_mask = vision_mask.unsqueeze(-1) inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds) # Handle audio tokens (>= embed_audio.vocab_offset) @@ -2133,7 +2133,7 @@ def forward( audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device) audio_embeds = self.embed_audio(input_ids=audio_input_ids) audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + expanded_audio_mask = audio_mask.unsqueeze(-1) inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds) else: per_layer_inputs = None @@ -2163,7 +2163,7 @@ def forward( audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape - extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_tokens = self.config.audio_soft_tokens_per_audio - audio_seq_len extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) audio_features = torch.cat((audio_features, extra_padding_features), dim=1) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index e97e1ef4c6d2..181605040330 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -357,7 +357,7 @@ class Gemma3nVisionConfig(TimmWrapperConfig): @strict class Gemma3nConfig(PreTrainedConfig): r""" - audio_soft_tokens_per_image (`int`, *optional*, defaults to 188): + audio_soft_tokens_per_audio (`int`, *optional*, defaults to 188): The number of soft tokens per audio clip. vision_soft_tokens_per_image (`int`, *optional*, defaults to 256): The number of soft tokens per image. 
@@ -404,7 +404,7 @@ class Gemma3nConfig(PreTrainedConfig): text_config: Gemma3nTextConfig | dict[str, Any] | None = None vision_config: Gemma3nVisionConfig | dict[str, Any] | None = None audio_config: Gemma3nAudioConfig | dict[str, Any] | None = None - audio_soft_tokens_per_image: int | None = 188 + audio_soft_tokens_per_audio: int | None = 188 vision_soft_tokens_per_image: int | None = 256 boi_token_id: int | None = 255_999 eoi_token_id: int | None = 262_144 @@ -1764,7 +1764,7 @@ def forward( Returns: audio_encodings: a torch.Tensor of shape - `[batch_size, self.config.audio_soft_tokens_per_image, + `[batch_size, self.config.audio_soft_tokens_per_audio, self.config.audio_config.hidden_size]` audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames]. """ @@ -2149,18 +2149,18 @@ def get_placeholder_mask( special_audio_mask = input_ids == self.config.audio_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0] * image_features.shape[1]}", ) n_audio_tokens = special_audio_mask.sum() - special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_audio_mask = special_audio_mask.unsqueeze(-1).to(inputs_embeds.device) if audio_features is not None: torch_compilable_check( - inputs_embeds[special_audio_mask].numel() == audio_features.numel(), + n_audio_tokens * inputs_embeds.shape[-1] == audio_features.numel(), f"Audio features and audio tokens do not match, tokens: {n_audio_tokens}, features: {audio_features.shape[0] * audio_features.shape[1]}", ) @@ -2233,7 +2233,7 @@ def forward( vision_input_ids = torch.where(vision_mask, input_ids, dummy_vision_token_id).to(inputs_embeds.device) vision_embeds = self.embed_vision(input_ids=vision_input_ids) vision_embeds = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - expanded_vision_mask = vision_mask.unsqueeze(-1).expand_as(inputs_embeds) + expanded_vision_mask = vision_mask.unsqueeze(-1) inputs_embeds = torch.where(expanded_vision_mask, vision_embeds, inputs_embeds) # Handle audio tokens (>= embed_audio.vocab_offset) @@ -2242,7 +2242,7 @@ def forward( audio_input_ids = torch.where(audio_mask, input_ids, dummy_audio_token_id).to(inputs_embeds.device) audio_embeds = self.embed_audio(input_ids=audio_input_ids) audio_embeds = audio_embeds.to(inputs_embeds.device, inputs_embeds.dtype) - expanded_audio_mask = audio_mask.unsqueeze(-1).expand_as(inputs_embeds) + expanded_audio_mask = audio_mask.unsqueeze(-1) inputs_embeds = torch.where(expanded_audio_mask, audio_embeds, inputs_embeds) else: per_layer_inputs = None @@ -2272,7 +2272,7 @@ def forward( audio_features = torch.where(audio_mask.unsqueeze(-1), audio_padding_embs, audio_features) audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape - extra_padding_tokens = self.config.audio_soft_tokens_per_image - audio_seq_len + extra_padding_tokens = self.config.audio_soft_tokens_per_audio - audio_seq_len extra_padding_features = audio_padding_embs.expand(audio_batch_size, extra_padding_tokens, audio_embed_dim) audio_features = 
torch.cat((audio_features, extra_padding_features), dim=1) diff --git a/src/transformers/models/gemma4/modeling_gemma4.py b/src/transformers/models/gemma4/modeling_gemma4.py index cdc4a6daeafc..d2acb10afae5 100644 --- a/src/transformers/models/gemma4/modeling_gemma4.py +++ b/src/transformers/models/gemma4/modeling_gemma4.py @@ -1442,7 +1442,7 @@ class Gemma4PreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"] _skip_keys_device_placement = ["past_key_values", "shared_kv_states"] - _supports_flash_attn = True + _supports_flash_attn = False # released checkpoints use head_dim=512, which is not supported yet by FA kernels _supports_sdpa = True _supports_flex_attn = True @@ -1941,7 +1941,8 @@ def forward( (self.config.attention_context_left - 1, self.config.attention_context_right) ), ) - attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) + if attention_mask is not None: + attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) for encoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = encoder_layer( diff --git a/src/transformers/models/gemma4/modular_gemma4.py b/src/transformers/models/gemma4/modular_gemma4.py index 739870f2a177..2bce7a9200c6 100644 --- a/src/transformers/models/gemma4/modular_gemma4.py +++ b/src/transformers/models/gemma4/modular_gemma4.py @@ -1159,6 +1159,7 @@ class Gemma4TextScaledWordEmbedding(Gemma3TextScaledWordEmbedding): class Gemma4PreTrainedModel(Gemma3nPreTrainedModel): _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"] input_modalities = ("image", "text", "video", "audio") + _supports_flash_attn = False # released checkpoints use head_dim=512, which is not supported yet by FA kernels _can_record_outputs = None # override @torch.no_grad() @@ -1511,7 +1512,8 @@ def forward( (self.config.attention_context_left - 1, self.config.attention_context_right) ), ) - attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) + if attention_mask is not None: + attention_mask = self._convert_4d_mask_to_blocked_5d(attention_mask) for encoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = encoder_layer( diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 9be97d01c425..8cfe34aaab49 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -679,7 +679,7 @@ def __init__(self, config: GitVisionConfig): embed_dim = config.hidden_size self.embeddings = GitVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) self.encoder = GitVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @@ -694,7 +694,7 @@ def forward( raise ValueError("You have to specify pixel_values") hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) - hidden_states = self.pre_layrnorm(hidden_states) + hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( inputs_embeds=hidden_states, diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 712202580943..186cbcc238e1 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -121,7 +121,7 @@ def 
forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index e99930ae57f6..64c349c1a5bc 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -319,7 +319,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/glm46v/modeling_glm46v.py b/src/transformers/models/glm46v/modeling_glm46v.py index 81207e4c8608..9b7e01ec3d93 100644 --- a/src/transformers/models/glm46v/modeling_glm46v.py +++ b/src/transformers/models/glm46v/modeling_glm46v.py @@ -333,18 +333,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask diff --git a/src/transformers/models/glm46v/processing_glm46v.py b/src/transformers/models/glm46v/processing_glm46v.py index 9dcf7c4856e6..6c5b561a69b6 100644 --- a/src/transformers/models/glm46v/processing_glm46v.py +++ b/src/transformers/models/glm46v/processing_glm46v.py @@ -27,12 +27,14 @@ from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, logging from ...video_utils import VideoInput +from .image_processing_glm46v import Glm46VImageProcessorKwargs logger = logging.get_logger(__name__) class Glm46VProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Glm46VImageProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py 
b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index cc5a564ab86f..3a61d135f417 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -102,7 +102,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py b/src/transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py index 0b8ccc865775..153bad424033 100644 --- a/src/transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +++ b/src/transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py @@ -249,7 +249,7 @@ def __init__(self, config: Glm4MoeLiteConfig, layer_idx: int): self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False) else: self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias) - self.q_a_layernorm = Glm4MoeLiteRMSNorm(config.q_lora_rank) + self.q_a_layernorm = Glm4MoeLiteRMSNorm(config.q_lora_rank, eps=config.rms_norm_eps) self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False) self.kv_a_proj_with_mqa = nn.Linear( @@ -257,7 +257,7 @@ def __init__(self, config: Glm4MoeLiteConfig, layer_idx: int): self.kv_lora_rank + self.qk_rope_head_dim, bias=config.attention_bias, ) - self.kv_a_layernorm = Glm4MoeLiteRMSNorm(self.kv_lora_rank) + self.kv_a_layernorm = Glm4MoeLiteRMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = nn.Linear( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 6121dc8d3fe8..84d2be8810ac 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -305,7 +305,7 @@ def forward( if is_flash_attention_requested(self.config): # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, @@ -1176,18 +1176,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - 
inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index d4a34a1952ad..d1878f19644f 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -861,18 +861,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 2d3e93aec9ed..cfd3b445d683 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -26,12 +26,14 @@ from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, logging from ...video_utils import VideoInput +from .image_processing_glm4v import Glm4vImageProcessorKwargs logger = logging.get_logger(__name__) class Glm4vProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Glm4vImageProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 3bf3dc157d3f..db8f7fdbb447 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -675,7 +675,7 @@ def forward( if is_flash_attention_requested(self.config): # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, @@ -1345,18 +1345,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), 
+ n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask @@ -1515,7 +1515,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -1523,7 +1523,9 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) @@ -1540,8 +1542,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git a/src/transformers/models/glm_image/modeling_glm_image.py b/src/transformers/models/glm_image/modeling_glm_image.py index 012da8513453..967419aa21ad 100644 --- a/src/transformers/models/glm_image/modeling_glm_image.py +++ b/src/transformers/models/glm_image/modeling_glm_image.py @@ -129,7 +129,7 @@ def forward( if "flash" in self.config._attn_implementation: # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, diff --git a/src/transformers/models/glm_image/modular_glm_image.py b/src/transformers/models/glm_image/modular_glm_image.py index e72aede3da66..4a1dd37b1b90 100644 --- a/src/transformers/models/glm_image/modular_glm_image.py +++ b/src/transformers/models/glm_image/modular_glm_image.py @@ -226,7 +226,7 @@ def forward( if "flash" in self.config._attn_implementation: # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = 
attention_interface( self, query_states, diff --git a/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py index 8f11f42794b3..9d7175de8583 100644 --- a/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/configuration_glm_moe_dsa.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from huggingface_hub.dataclasses import strict from ...configuration_utils import PreTrainedConfig @@ -116,6 +117,7 @@ class GlmMoeDsaConfig(PreTrainedConfig): mlp_layer_types: list[str] | None = None attention_bias: bool = False attention_dropout: float | int = 0.0 + num_experts: int = 256 index_topk: int = 2048 index_head_dim: int = 128 index_n_heads: int = 32 diff --git a/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py index 736dcdce32c3..ccf67726d089 100644 --- a/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/modeling_glm_moe_dsa.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from collections.abc import Callable from typing import Optional @@ -30,6 +31,7 @@ from ...cache_utils import Cache, DynamicCache from ...generation import GenerationMixin from ...integrations import use_experts_implementation, use_kernel_forward_from_hub +from ...integrations.dsa_kernels import act_quant, fp8_index from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer @@ -64,11 +66,12 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) +def rotate_activation(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.bfloat16 + from fast_hadamard_transform import hadamard_transform + + hidden_size = x.size(-1) + return hadamard_transform(x, scale=hidden_size**-0.5) def apply_rotary_pos_emb( @@ -93,13 +96,14 @@ def apply_rotary_pos_emb( Returns: `torch.Tensor`: Tensor with rotary embeddings applied, same shape as input. """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - - # Split-half (NeoX/Llama style): (x[:d/2], x[d/2:]) - # This matches llama's apply_rotary_pos_emb logic. - x_rotated = (x * cos) + (rotate_half(x) * sin) - return x_rotated + # Interleaved (GPT-J style): (x[0], x[1]), (x[2], x[3]), ... + # RotaryEmbedding outputs cos/sin with repeated halves for NeoX compatibility, + # while interleaved rotation expects [.., D/2] frequencies. + cos = cos[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim) + sin = sin[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim) + x1 = x[..., ::2] + x2 = x[..., 1::2] + return torch.stack((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1).flatten(-2) class GlmMoeDsaIndexer(nn.Module): @@ -107,8 +111,7 @@ class GlmMoeDsaIndexer(nn.Module): DeepSeek Sparse Attention (DSA) indexer for selecting top-k tokens. The Indexer has its own lightweight projections (wq_b, wk) separate from the - main MLA attention. It uses non-interleaved (NeoX/Llama) RoPE, unlike the main attention - which uses interleaved RoPE. + main MLA attention. 
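Because the indexer now applies interleaved (GPT-J style) rotation while the shared rotary embedding still emits NeoX-layout `cos`/`sin` (the same half repeated twice), the sketch below restates the new pairing logic on dummy tensors. Names and sizes are illustrative, not the module's actual call sites:

```python
import torch

def apply_interleaved_rope(x, cos, sin, unsqueeze_dim=1):
    # cos/sin arrive with their two halves repeated (NeoX layout: [f, f]);
    # keep only the first D/2 frequencies for the interleaved pairing.
    cos = cos[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim)
    sin = sin[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim)
    x1, x2 = x[..., ::2], x[..., 1::2]          # even/odd feature pairs
    return torch.stack((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1).flatten(-2)

batch, seq, heads, dim = 1, 4, 2, 8             # toy shapes
x = torch.randn(batch, seq, heads, dim)
freqs = torch.randn(batch, seq, dim // 2)
cos = torch.cat((freqs, freqs), dim=-1).cos()   # NeoX-style repeated halves
sin = torch.cat((freqs, freqs), dim=-1).sin()

rotated = apply_interleaved_rope(x, cos, sin, unsqueeze_dim=2)
assert rotated.shape == x.shape
```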
**Cache strategy**: The Indexer manages its own key cache (`_cached_keys`) separately from the DynamicCache used by MLA attention, since DynamicCache is sized for exactly @@ -137,9 +140,12 @@ def __init__(self, config: "GlmMoeDsaConfig", layer_idx: int): # Keeping it as a plain Linear prevents FP8 conversion (see `_keep_in_fp32_modules`). self.weights_proj = nn.Linear(self.hidden_size, self.n_heads, bias=False) self.softmax_scale = self.head_dim**-0.5 + self.scale_fmt = "ue8m0" + self.quant_block_size = 128 # Indexer maintains its own key cache (not in DynamicCache, which is sized for attention layers only) self.register_buffer("_cached_keys", None, persistent=False) + self.register_buffer("_cached_keys_scales", None, persistent=False) @torch.no_grad() def forward( @@ -187,19 +193,29 @@ def forward( k_pe = apply_rotary_pos_emb(k_pe.unsqueeze(2), cos, sin, unsqueeze_dim=2).squeeze(2) # [B, S, rope_D] k = torch.cat([k_pe, k_nope], dim=-1) # [B, S, D] + q = rotate_activation(q) # [B, S, H, D] + k = rotate_activation(k) # [B, S, D] + q_fp8, q_scale = act_quant(q, self.quant_block_size, self.scale_fmt) + k_fp8, k_scale = act_quant(k, self.quant_block_size, self.scale_fmt) + # === Key cache (managed by the indexer, not DynamicCache) === # Reset cache on prefill (new prompt) to avoid stale keys / batch-size mismatch if seq_len > 1: self._cached_keys = None + self._cached_keys_scales = None if use_cache: if self._cached_keys is not None: - k_cached = torch.cat([self._cached_keys, k], dim=1) # [B, T, D] + k_cached = torch.cat([self._cached_keys, k_fp8], dim=1) # [B, T, D] + k_scale_cached = torch.cat([self._cached_keys_scales, k_scale.squeeze(-1)], dim=1) # [B, T] else: - k_cached = k + k_cached = k_fp8 + k_scale_cached = k_scale.squeeze(-1) self._cached_keys = k_cached + self._cached_keys_scales = k_scale_cached else: - k_cached = k + k_cached = k_fp8 + k_scale_cached = k_scale.squeeze(-1) # === Scoring === # Reference: weights = weights_proj(x.float()) * n_heads^(-0.5) @@ -213,19 +229,17 @@ def forward( # Don't force fp32 inputs here: the checkpoint stores `weights_proj.weight` in bf16. # Use native dtype for matmul, then upcast the result for scoring stability. 
weights = self.weights_proj(hidden_states).float() * (self.n_heads**-0.5) # [B, S, H] + weights = weights * q_scale.squeeze(-1) * self.softmax_scale # [B, S, H] - # q·k^T per head: [B, S, H, D] @ [B, T, D]^T → [B, S, H, T] - scores = torch.einsum("bshd,btd->bsht", q.float(), k_cached.float()) * self.softmax_scale - scores = F.relu(scores) - # Weight per head and sum across heads → [B, S, T] - index_scores = torch.einsum("bsht,bsh->bst", scores, weights) + index_score = fp8_index( + q_fp8.contiguous(), weights.contiguous(), k_cached.contiguous(), k_scale_cached.contiguous() + ) # [B, S, T] if attention_mask is not None: - index_scores = index_scores + attention_mask + index_score = index_score + attention_mask - total_len = index_scores.shape[-1] - topk = min(self.index_topk, total_len) - topk_indices = index_scores.topk(topk, dim=-1).indices # [B, S, topk] + actual_topk = min(self.index_topk, index_score.shape[-1]) + topk_indices = index_score.topk(actual_topk, dim=-1)[1] # [B, S, actual_topk] return topk_indices diff --git a/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py b/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py index 2e7e91200d8b..ab0026c52818 100644 --- a/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py +++ b/src/transformers/models/glm_moe_dsa/modular_glm_moe_dsa.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + from collections.abc import Callable import torch @@ -21,11 +22,11 @@ from ...cache_utils import Cache, DynamicCache from ...configuration_utils import PreTrainedConfig +from ...integrations.dsa_kernels import act_quant, fp8_index from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...models.llama.modeling_llama import rotate_half from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging from ...utils.generic import is_flash_attention_requested @@ -45,6 +46,14 @@ logger = logging.get_logger(__name__) +def rotate_activation(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.bfloat16 + from fast_hadamard_transform import hadamard_transform + + hidden_size = x.size(-1) + return hadamard_transform(x, scale=hidden_size**-0.5) + + def apply_rotary_pos_emb( x: torch.Tensor, cos: torch.Tensor, @@ -67,13 +76,14 @@ def apply_rotary_pos_emb( Returns: `torch.Tensor`: Tensor with rotary embeddings applied, same shape as input. """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - - # Split-half (NeoX/Llama style): (x[:d/2], x[d/2:]) - # This matches llama's apply_rotary_pos_emb logic. - x_rotated = (x * cos) + (rotate_half(x) * sin) - return x_rotated + # Interleaved (GPT-J style): (x[0], x[1]), (x[2], x[3]), ... + # RotaryEmbedding outputs cos/sin with repeated halves for NeoX compatibility, + # while interleaved rotation expects [.., D/2] frequencies. 
+ cos = cos[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim) + sin = sin[..., : x.shape[-1] // 2].unsqueeze(unsqueeze_dim) + x1 = x[..., ::2] + x2 = x[..., 1::2] + return torch.stack((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1).flatten(-2) @auto_docstring(checkpoint="zai-org/GLM-5") @@ -128,6 +138,7 @@ class GlmMoeDsaConfig(Glm4MoeLiteConfig): num_hidden_layers: int = 78 num_attention_heads: int = 64 num_key_value_heads: int = 64 + num_experts: int = 256 n_routed_experts: int = 256 routed_scaling_factor: float = 2.5 q_lora_rank: int = 2048 @@ -173,8 +184,7 @@ class GlmMoeDsaIndexer(nn.Module): DeepSeek Sparse Attention (DSA) indexer for selecting top-k tokens. The Indexer has its own lightweight projections (wq_b, wk) separate from the - main MLA attention. It uses non-interleaved (NeoX/Llama) RoPE, unlike the main attention - which uses interleaved RoPE. + main MLA attention. **Cache strategy**: The Indexer manages its own key cache (`_cached_keys`) separately from the DynamicCache used by MLA attention, since DynamicCache is sized for exactly @@ -203,9 +213,12 @@ def __init__(self, config: "GlmMoeDsaConfig", layer_idx: int): # Keeping it as a plain Linear prevents FP8 conversion (see `_keep_in_fp32_modules`). self.weights_proj = nn.Linear(self.hidden_size, self.n_heads, bias=False) self.softmax_scale = self.head_dim**-0.5 + self.scale_fmt = "ue8m0" + self.quant_block_size = 128 # Indexer maintains its own key cache (not in DynamicCache, which is sized for attention layers only) self.register_buffer("_cached_keys", None, persistent=False) + self.register_buffer("_cached_keys_scales", None, persistent=False) @torch.no_grad() def forward( @@ -253,19 +266,29 @@ def forward( k_pe = apply_rotary_pos_emb(k_pe.unsqueeze(2), cos, sin, unsqueeze_dim=2).squeeze(2) # [B, S, rope_D] k = torch.cat([k_pe, k_nope], dim=-1) # [B, S, D] + q = rotate_activation(q) # [B, S, H, D] + k = rotate_activation(k) # [B, S, D] + q_fp8, q_scale = act_quant(q, self.quant_block_size, self.scale_fmt) + k_fp8, k_scale = act_quant(k, self.quant_block_size, self.scale_fmt) + # === Key cache (managed by the indexer, not DynamicCache) === # Reset cache on prefill (new prompt) to avoid stale keys / batch-size mismatch if seq_len > 1: self._cached_keys = None + self._cached_keys_scales = None if use_cache: if self._cached_keys is not None: - k_cached = torch.cat([self._cached_keys, k], dim=1) # [B, T, D] + k_cached = torch.cat([self._cached_keys, k_fp8], dim=1) # [B, T, D] + k_scale_cached = torch.cat([self._cached_keys_scales, k_scale.squeeze(-1)], dim=1) # [B, T] else: - k_cached = k + k_cached = k_fp8 + k_scale_cached = k_scale.squeeze(-1) self._cached_keys = k_cached + self._cached_keys_scales = k_scale_cached else: - k_cached = k + k_cached = k_fp8 + k_scale_cached = k_scale.squeeze(-1) # === Scoring === # Reference: weights = weights_proj(x.float()) * n_heads^(-0.5) @@ -279,19 +302,17 @@ def forward( # Don't force fp32 inputs here: the checkpoint stores `weights_proj.weight` in bf16. # Use native dtype for matmul, then upcast the result for scoring stability. 
weights = self.weights_proj(hidden_states).float() * (self.n_heads**-0.5) # [B, S, H] + weights = weights * q_scale.squeeze(-1) * self.softmax_scale # [B, S, H] - # q·k^T per head: [B, S, H, D] @ [B, T, D]^T → [B, S, H, T] - scores = torch.einsum("bshd,btd->bsht", q.float(), k_cached.float()) * self.softmax_scale - scores = F.relu(scores) - # Weight per head and sum across heads → [B, S, T] - index_scores = torch.einsum("bsht,bsh->bst", scores, weights) + index_score = fp8_index( + q_fp8.contiguous(), weights.contiguous(), k_cached.contiguous(), k_scale_cached.contiguous() + ) # [B, S, T] if attention_mask is not None: - index_scores = index_scores + attention_mask + index_score = index_score + attention_mask - total_len = index_scores.shape[-1] - topk = min(self.index_topk, total_len) - topk_indices = index_scores.topk(topk, dim=-1).indices # [B, S, topk] + actual_topk = min(self.index_topk, index_score.shape[-1]) + topk_indices = index_score.topk(actual_topk, dim=-1)[1] # [B, S, actual_topk] return topk_indices diff --git a/src/transformers/models/glm_ocr/modeling_glm_ocr.py b/src/transformers/models/glm_ocr/modeling_glm_ocr.py index 828a99a705b5..bec157674bad 100644 --- a/src/transformers/models/glm_ocr/modeling_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modeling_glm_ocr.py @@ -429,7 +429,7 @@ def forward( if is_flash_attention_requested(self.config): # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, @@ -1092,18 +1092,18 @@ def get_placeholder_mask( special_video_mask = input_ids == self.config.image_token_id n_image_tokens = special_image_mask.sum() - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) if image_features is not None: torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {image_features.shape[0]}", ) n_video_tokens = special_video_mask.sum() - special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_video_mask = special_video_mask.unsqueeze(-1).to(inputs_embeds.device) if video_features is not None: torch_compilable_check( - inputs_embeds[special_video_mask].numel() == video_features.numel(), + n_video_tokens * inputs_embeds.shape[-1] == video_features.numel(), f"Video features and video tokens do not match, tokens: {n_video_tokens}, features: {video_features.shape[0]}", ) return special_image_mask, special_video_mask diff --git a/src/transformers/models/glm_ocr/modular_glm_ocr.py b/src/transformers/models/glm_ocr/modular_glm_ocr.py index 2f71dded711d..cbd89201179a 100644 --- a/src/transformers/models/glm_ocr/modular_glm_ocr.py +++ b/src/transformers/models/glm_ocr/modular_glm_ocr.py @@ -182,7 +182,7 @@ def forward( if is_flash_attention_requested(self.config): # Flash Attention: Use cu_seqlens for variable length attention - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() attn_output, _ = attention_interface( self, query_states, diff --git a/src/transformers/models/got_ocr2/modeling_got_ocr2.py 
b/src/transformers/models/got_ocr2/modeling_got_ocr2.py index ab072a8b1f5f..2eaad185933c 100644 --- a/src/transformers/models/got_ocr2/modeling_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modeling_got_ocr2.py @@ -579,9 +579,9 @@ def get_placeholder_mask( n_image_tokens = special_image_mask.sum() n_image_features = image_features.shape[0] * image_features.shape[1] - special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device) torch_compilable_check( - inputs_embeds[special_image_mask].numel() == image_features.numel(), + n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(), f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}", ) return special_image_mask diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 709e6ca86a48..13323ab3d83c 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -81,7 +81,7 @@ class GPT2Config(PreTrainedConfig): n_layer: int = 12 n_head: int = 12 n_inner: int | None = None - activation_function: str = "gelu_new" + activation_function: str = "gelu" resid_pdrop: float | int = 0.1 embd_pdrop: float | int = 0.1 attn_pdrop: float | int = 0.1 diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 10e4b5922add..d227d71120a8 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -108,7 +108,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index e334ce023d67..d92020b0152b 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -111,7 +111,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index 47c029a5bca9..66c993a94fdf 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -23,9 +23,6 @@ @strict class GptOssConfig(PreTrainedConfig): model_type = "gpt_oss" - attribute_map = { - "num_experts": "num_local_experts", - } 
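On the repeated `@` → `*` change in the rotary forward passes: with `inv_freq_expanded` shaped `[batch, dim/2, 1]` and `position_ids_expanded` shaped `[batch, 1, seq_len]`, the batched matmul degenerates to an outer product, so broadcasted elementwise multiplication produces the same `[batch, dim/2, seq_len]` tensor. A quick numerical check under those assumed shapes:

```python
import torch

batch, half_dim, seq_len = 2, 4, 6               # toy sizes
inv_freq_expanded = torch.randn(batch, half_dim, 1, dtype=torch.float32)
position_ids_expanded = torch.randn(batch, 1, seq_len, dtype=torch.float32)

freqs_matmul = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
freqs_broadcast = (inv_freq_expanded * position_ids_expanded).transpose(1, 2)

assert freqs_matmul.shape == freqs_broadcast.shape == (batch, seq_len, half_dim)
torch.testing.assert_close(freqs_matmul, freqs_broadcast)
```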
default_theta = 150000.0 base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), diff --git a/src/transformers/models/gpt_oss/modeling_gpt_oss.py b/src/transformers/models/gpt_oss/modeling_gpt_oss.py index 55381a7e3c21..d0191d373238 100644 --- a/src/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/src/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -475,6 +475,7 @@ def forward( "inputs_embeds": inputs_embeds, "attention_mask": attention_mask, "past_key_values": past_key_values, + "position_ids": position_ids, } causal_mask_mapping = { "full_attention": create_causal_mask(**mask_kwargs), @@ -537,7 +538,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -545,13 +546,17 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) else: - batch_size, sequence_length = attention_mask.shape - num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + num_hidden_layers = len(gate_logits) + batch_size = attention_mask.shape[0] + sequence_length = gate_logits[0].shape[0] // batch_size + attention_mask = attention_mask[:, -sequence_length:] # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask expert_attention_mask = ( @@ -562,8 +567,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 934345fe6723..25927348cd9a 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -356,7 +356,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py 
b/src/transformers/models/granitemoe/modeling_granitemoe.py index 5fb53d6afe49..40277ef1a64d 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -121,7 +121,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -577,7 +577,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1) _, selected_experts = torch.topk(routing_weights, top_k, dim=-1) @@ -585,7 +585,9 @@ def load_balancing_loss_func( if attention_mask is None: # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.mean(expert_mask.float(), dim=0) + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k # Compute the average probability of routing to these experts router_prob_per_expert = torch.mean(routing_weights, dim=0) @@ -602,8 +604,10 @@ def load_balancing_loss_func( ) # Compute the percentage of tokens routed to each experts - tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum( - expert_attention_mask, dim=0 + # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i + # See: https://github.com/huggingface/transformers/issues/43688 + tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / ( + torch.sum(expert_attention_mask, dim=0) * top_k ) # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 2e0926f3e5d4..fdabd685fe9a 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -833,7 +833,7 @@ def forward(self, x, position_ids): device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with maybe_autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling @@ -1258,7 +1258,7 @@ def load_balancing_loss_func( compute_device = gate_logits[0].device concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0) - routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1) + routing_weights = 
diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
index 2e0926f3e5d4..fdabd685fe9a 100644
--- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
+++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py
@@ -833,7 +833,7 @@ def forward(self, x, position_ids):
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
@@ -1258,7 +1258,7 @@ def load_balancing_loss_func(
     compute_device = gate_logits[0].device
     concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
-    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1)
     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
@@ -1266,7 +1266,9 @@ def load_balancing_loss_func(
     if attention_mask is None:
         # Compute the percentage of tokens routed to each experts
-        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+        # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i
+        # See: https://github.com/huggingface/transformers/issues/43688
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k
         # Compute the average probability of routing to these experts
         router_prob_per_expert = torch.mean(routing_weights, dim=0)
@@ -1283,8 +1285,10 @@ def load_balancing_loss_func(
         )
         # Compute the percentage of tokens routed to each experts
-        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
-            expert_attention_mask, dim=0
+        # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i
+        # See: https://github.com/huggingface/transformers/issues/43688
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / (
+            torch.sum(expert_attention_mask, dim=0) * top_k
         )
         # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
index 741c58e005f8..36a87cc00ed4 100644
--- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
+++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py
@@ -23,7 +23,7 @@
 from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling, logging
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..bamba.configuration_bamba import BambaConfig
@@ -275,8 +275,9 @@ def _update_mamba_mask(self, attention_mask, past_key_values):
             2. Attending to all inputs
         """
         mamba_mask = attention_mask
-        if (past_key_values is not None and past_key_values.has_previous_state()) or (
-            attention_mask is not None and torch.all(attention_mask == 1)
+        if not is_torchdynamo_compiling() and (
+            (past_key_values is not None and past_key_values.has_previous_state())
+            or (attention_mask is not None and torch.all(attention_mask == 1))
         ):
             mamba_mask = None
         return mamba_mask
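The `_update_mamba_mask` change above guards the value-dependent fast path (dropping the mask when `torch.all(attention_mask == 1)`) behind `not is_torchdynamo_compiling()`, so the decision is never baked into a compiled graph. A minimal sketch of the pattern, using `torch._dynamo.is_compiling` as a stand-in for the transformers helper:

```python
import torch

try:
    # Stand-in for transformers' own is_torchdynamo_compiling utility
    from torch._dynamo import is_compiling as is_torchdynamo_compiling
except ImportError:  # very old torch: assume eager execution
    def is_torchdynamo_compiling():
        return False


def update_mamba_mask(attention_mask, has_previous_state=False):
    # Only drop the mask based on its *values* when running eagerly; under
    # torch.compile this branch would depend on tensor data, so keep the mask.
    if not is_torchdynamo_compiling() and (
        has_previous_state or (attention_mask is not None and torch.all(attention_mask == 1))
    ):
        return None
    return attention_mask
```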
diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
index 71f8c6eaff7d..2f9533f92e45 100644
--- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
+++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py
@@ -524,7 +524,7 @@ def forward(self, x, position_ids):
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
@@ -646,7 +646,7 @@ def load_balancing_loss_func(
     compute_device = gate_logits[0].device
     concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
-    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dtype=torch.float, dim=-1)
     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
@@ -654,7 +654,9 @@ def load_balancing_loss_func(
     if attention_mask is None:
         # Compute the percentage of tokens routed to each experts
-        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+        # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i
+        # See: https://github.com/huggingface/transformers/issues/43688
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0) / top_k
         # Compute the average probability of routing to these experts
         router_prob_per_expert = torch.mean(routing_weights, dim=0)
@@ -671,8 +673,10 @@ def load_balancing_loss_func(
         )
         # Compute the percentage of tokens routed to each experts
-        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
-            expert_attention_mask, dim=0
+        # Normalize by top_k so that sum(f_i) = 1, matching the distribution of P_i
+        # See: https://github.com/huggingface/transformers/issues/43688
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / (
+            torch.sum(expert_attention_mask, dim=0) * top_k
         )
         # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py
index 7835885fd42d..4d6f0201cc7d 100644
--- a/src/transformers/models/grounding_dino/processing_grounding_dino.py
+++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py
@@ -30,6 +30,7 @@
 if TYPE_CHECKING:
     from .modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs
 AnnotationType = dict[str, int | str | list[dict]]
@@ -98,6 +99,7 @@ def get(self, key, *args, **kwargs):
 class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: GroundingDinoImageProcessorKwargs
     _defaults = {
         "text_kwargs": {
             "add_special_tokens": True,
diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py
index 8283fcb19e28..653867d7c5bd 100644
--- a/src/transformers/models/helium/modeling_helium.py
+++ b/src/transformers/models/helium/modeling_helium.py
@@ -119,7 +119,7 @@ def forward(self, x, position_ids):
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
diff --git a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py
index a0f106167721..eec49fca3f07 100644
--- a/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py
+++ b/src/transformers/models/higgs_audio_v2/modeling_higgs_audio_v2.py
@@ -524,7 +524,7 @@ def forward(
                 else audio_embeds
             )
             inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask[..., None].expand_as(inputs_embeds), audio_embeds.to(inputs_embeds.device)
+                audio_token_mask[..., None], audio_embeds.to(inputs_embeds.device)
             )
         elif audio_input_ids is not None:
             inputs_embeds = audio_embeds
diff --git a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py
index 03d34b0e3444..8c48760a5a17 100644
--- a/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py
+++ b/src/transformers/models/higgs_audio_v2/modular_higgs_audio_v2.py
@@ -326,7 +326,7 @@ def forward(
                 else audio_embeds
             )
             inputs_embeds = inputs_embeds.masked_scatter(
-                audio_token_mask[..., None].expand_as(inputs_embeds), audio_embeds.to(inputs_embeds.device)
+                audio_token_mask[..., None], audio_embeds.to(inputs_embeds.device)
             )
         elif audio_input_ids is not None:
             inputs_embeds = audio_embeds
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index e3934ba80f68..f0c561e93920 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -36,6 +36,7 @@
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel, get_torch_context_manager_or_global_device
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils.generic import _conv_out_length
 from .configuration_hubert import HubertConfig
@@ -676,11 +677,6 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor | int
         Computes the output length of the convolutional layers
         """
-        def _conv_out_length(input_length, kernel_size, stride):
-            # 1D convolutional layer output length formula taken
-            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
-
         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
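The `_conv_out_length` helper that these Hubert diffs now import from `...utils.generic` is the standard Conv1d output-length formula, `floor((L - kernel_size) / stride) + 1`, applied once per layer of the feature extractor. A small standalone sketch of how the loop collapses a raw audio length into feature frames (the kernel/stride values are illustrative, not Hubert's actual config):

```python
import torch


def conv_out_length(input_length, kernel_size, stride):
    # 1D convolution output length: floor((L - kernel_size) / stride) + 1
    return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1


conv_kernel = (10, 3, 3)   # illustrative values only
conv_stride = (5, 2, 2)

lengths = torch.tensor([16000])  # e.g. one second of 16 kHz audio
for kernel_size, stride in zip(conv_kernel, conv_stride):
    lengths = conv_out_length(lengths, kernel_size, stride)
print(lengths)  # number of frames produced by this toy conv stack
```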
diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py
index 59a72d3269cb..dac73d85ccb2 100644
--- a/src/transformers/models/hubert/modular_hubert.py
+++ b/src/transformers/models/hubert/modular_hubert.py
@@ -22,6 +22,7 @@
 from ...modeling_outputs import BaseModelOutput
 from ...modeling_utils import PreTrainedModel
 from ...utils import auto_docstring
+from ...utils.generic import _conv_out_length
 from ..wav2vec2.modeling_wav2vec2 import (
     Wav2Vec2Encoder,
     Wav2Vec2EncoderStableLayerNorm,
@@ -174,11 +175,6 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor | int
         Computes the output length of the convolutional layers
         """
-        def _conv_out_length(input_length, kernel_size, stride):
-            # 1D convolutional layer output length formula taken
-            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
-            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
-
         for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
             input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
diff --git a/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py
index d1652d78cbbc..1812977963cf 100644
--- a/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py
+++ b/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py
@@ -376,7 +376,7 @@ def forward(self, x, position_ids):
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
diff --git a/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
index 19779da0528c..970daefaa2f3 100644
--- a/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
+++ b/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
@@ -465,7 +465,7 @@ def forward(self, x, position_ids):
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (inv_freq_expanded.float() * position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
             sin = emb.sin() * self.attention_scaling
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index 8f4578e1d0f2..687bc71f30a9 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -156,7 +156,9 @@ def expand_inputs_for_generation(
     return input_ids, model_kwargs
-def freeze_model(model, module_exceptions=()):
+def freeze_model(model, module_exceptions=None):
+    if module_exceptions is None:
+        module_exceptions = []
     mapping = {
         "LayerNorm": nn.LayerNorm,
         "Linear": nn.Linear,
@@ -927,11 +929,15 @@ def freeze_relevant_params(self, config=None):
         if config.freeze_vision_layers:
             freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)
-    def freeze_text_layers(self, module_exceptions=()):
+    def freeze_text_layers(self, module_exceptions=None):
+        if module_exceptions is None:
+            module_exceptions = []
         for module in [self.layers, self.norm]:
             freeze_model(module, module_exceptions=module_exceptions)
-    def freeze_vision_layers(self, module_exceptions=()):
+    def freeze_vision_layers(self, module_exceptions=None):
+        if module_exceptions is None:
+            module_exceptions = []
         freeze_model(self.vision_model, module_exceptions=module_exceptions)
     @merge_with_config_defaults
diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index b774d10b35c7..8d099c3bbcdd 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -31,6 +31,7 @@
 if is_torch_available():
     import torch
+from .image_processing_idefics import IdeficsImageProcessorKwargs
 IMAGE_TOKEN = "<image>"
@@ -52,6 +53,7 @@ class IdeficsTextKwargs(TextKwargs, total=False):
 class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: IdeficsImageProcessorKwargs
     text_kwargs: IdeficsTextKwargs
     _defaults = {
         "text_kwargs": {
diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index 5d81439e27b6..770b7d6c9fd5 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -815,7 +815,7 @@ def inputs_merger(
         else:
             special_image_mask = input_ids == self.config.image_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         image_hidden_states = image_hidden_states.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_hidden_states)
         return inputs_embeds
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index dd87290838ff..95a1c41fea03 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -32,6 +32,7 @@
 if TYPE_CHECKING:
     from ...tokenization_utils_base import PreTokenizedInput
+from .image_processing_idefics2 import Idefics2ImageProcessorKwargs
 logger = logging.get_logger(__name__)
@@ -46,6 +47,7 @@ def is_image_or_image_url(elem):
 class Idefics2ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Idefics2ImageProcessorKwargs
     _defaults = {
         "text_kwargs": {
             "add_special_tokens": True,
diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index 2c58aba032cd..86a8ac50ce04 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -559,7 +559,7 @@ def inputs_merger(
         else:
             special_image_mask = input_ids == self.config.image_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         image_hidden_states = image_hidden_states.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_hidden_states)
         return inputs_embeds
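The repeated removal of `.expand_as(inputs_embeds)` in these diffs works because `Tensor.masked_scatter` only requires the mask to be broadcastable to the target, so the `(batch, seq, 1)` mask no longer has to be materialized at full hidden size. A minimal sketch with toy shapes (illustrative values, not the model code):

```python
import torch

hidden_size = 4
inputs_embeds = torch.zeros(1, 5, hidden_size)
image_features = torch.ones(2, hidden_size)  # embeddings for two placeholder positions

special_image_mask = torch.tensor([[False, True, False, True, False]])

# The (1, 5, 1) boolean mask broadcasts against (1, 5, hidden_size); no expand_as needed
out = inputs_embeds.masked_scatter(special_image_mask.unsqueeze(-1), image_features)
print(out[0, 1], out[0, 3])  # both placeholder rows are filled from image_features
```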
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index f43ac76bf3ff..24d05f958c35 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -30,6 +30,8 @@
 if TYPE_CHECKING:
     from ...tokenization_utils_base import PreTokenizedInput
+from .image_processing_idefics3 import Idefics3ImageProcessorKwargs
+
 logger = logging.get_logger(__name__)
@@ -87,6 +89,7 @@ def get_image_prompt_string(
 class Idefics3ProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Idefics3ImageProcessorKwargs
     _defaults = {
         "text_kwargs": {
             "add_special_tokens": True,
diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 29f32f17d6c4..1faaa9f536ba 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -998,7 +998,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch
         else:
             special_image_mask = input_ids == self.config.image_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         return special_image_mask
     @can_return_tuple
@@ -1257,7 +1257,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch
         else:
             special_image_mask = input_ids == self.config.image_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         return special_image_mask
     @can_return_tuple
diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
index 06d3d28b2c88..955794db2b0b 100644
--- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py
@@ -982,7 +982,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch
         else:
             special_image_mask = input_ids == self.config.image_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         return special_image_mask
     @can_return_tuple
@@ -1074,7 +1074,7 @@ def forward(
         )
         special_image_mask = special_image_mask.all(-1)
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
@@ -1205,7 +1205,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch
         else:
             special_image_mask = input_ids == self.config.video_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         return special_image_mask
     @can_return_tuple
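The `images_kwargs: ...ImageProcessorKwargs` annotations added to these `ProcessorKwargs` classes follow the processors' TypedDict-style pattern, so type checkers and kwargs-merging helpers know which image-specific keys a given processor accepts. A self-contained sketch of the idea using plain `typing.TypedDict` stand-ins rather than the actual transformers classes (all names below are illustrative):

```python
from typing import TypedDict


class MyImageProcessorKwargs(TypedDict, total=False):
    do_resize: bool
    size: dict
    crop_to_patches: bool  # a model-specific image option


class MyProcessorKwargs(TypedDict, total=False):
    text_kwargs: dict
    images_kwargs: MyImageProcessorKwargs  # typed nested kwargs, as in the diffs above


kwargs: MyProcessorKwargs = {
    "text_kwargs": {"add_special_tokens": True},
    "images_kwargs": {"do_resize": True, "crop_to_patches": True},
}
```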
diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
index d84f3fd13398..862a812fdeb5 100644
--- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py
@@ -209,7 +209,7 @@ def forward(
         )
         special_image_mask = special_image_mask.all(-1)
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
@@ -324,7 +324,7 @@ def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch
         else:
             special_image_mask = input_ids == self.config.video_token_id
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         return special_image_mask
     @can_return_tuple
diff --git a/src/transformers/models/internvl/modeling_internvl.py b/src/transformers/models/internvl/modeling_internvl.py
index 284d97406e65..7c61c4eee2b8 100644
--- a/src/transformers/models/internvl/modeling_internvl.py
+++ b/src/transformers/models/internvl/modeling_internvl.py
@@ -609,9 +609,9 @@ def get_placeholder_mask(
         n_image_tokens = special_image_mask.sum()
         n_image_features = image_features.shape[0] * image_features.shape[1]
-        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
         torch_compilable_check(
-            inputs_embeds[special_image_mask].numel() == image_features.numel(),
+            n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(),
             f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
         )
         return special_image_mask
diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py
index 84c611115dcf..36dc8082a4d0 100644
--- a/src/transformers/models/internvl/processing_internvl.py
+++ b/src/transformers/models/internvl/processing_internvl.py
@@ -21,9 +21,11 @@
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import auto_docstring
 from ...video_utils import VideoInput
+from ..got_ocr2.image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs
 class InternVLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: GotOcr2ImageProcessorKwargs
     _defaults = {
         "text_kwargs": {
             "padding_side": "left",
@@ -75,7 +77,7 @@ def _insert_media_placeholders(
         video_num_patches: list[int],
         image_num_patches_indices: np.ndarray,
         video_num_patches_indices: np.ndarray,
-        video_patch_indices: np.ndarray,
+        video_frame_indices: np.ndarray,
     ):
         """
         Processes interleaved text with and