Commits
633 commits
008d9e9
Switch to canonical _is_hf_initialized flag per review
vai-minzhou Apr 24, 2026
b8ef47c
Merge branch 'main' into fix/seq2seq-decoder-encoder-attention-mask
duyhv-qualgo Apr 24, 2026
c3ef3d6
fix(qianfan_ocr): auto-fix failing tests
kaixuanliu Apr 24, 2026
b7689c6
Add 'requests' to serving extras dependencies
Oneirag Apr 24, 2026
b15878b
Merge pull request #1 from Oneirag/Oneirag-missing-requests-serving
Oneirag Apr 24, 2026
4df9607
Merge branch 'main' into fix/eta-warper-all-inf
Cyrilvallez Apr 24, 2026
343af8e
Processing Utils: honor pre-built sub-processor kwargs in from_pretra…
javierdejesusda Apr 24, 2026
7889d44
Fix local trust_remote_code cache key collisions
Jeevang1-epic Apr 24, 2026
08ac3d8
Move repetition penalty guard to logits processor
ruben-aghayan Apr 25, 2026
0361926
Merge branch 'main' into fix-repetition-penalty-inputs-embeds
ruben-aghayan Apr 25, 2026
47a512b
Fix xdist collisions for captured_info artifacts and preserve CI debu…
stationeros Apr 25, 2026
9abd5e7
Truncate hash to 16 chars to prevent Windows path length issues
Jeevang1-epic Apr 25, 2026
74480d4
Skip CPU param materialization on non-rank-0 FSDP ranks to avoid OOM
AmineDiro Apr 25, 2026
388ad09
Merge branch 'main' into gemma4-fix
kaixuanliu Apr 27, 2026
6165de2
update
kaixuanliu Apr 27, 2026
d94ced8
Merge branch 'main' into main
stationeros Apr 27, 2026
7a52c40
Merge branch 'main' into torch-type
zucchini-nlp Apr 27, 2026
deb916e
Fix EP+FSDP2: wrap EP-sharded params as DTensors and exclude experts …
AmineDiro Apr 27, 2026
7ad712a
mappings on classes, scoping for every transforms
yonigozlan Apr 27, 2026
c63a7d8
fix style
yonigozlan Apr 27, 2026
17de22d
cleanup imports
AmineDiro Apr 27, 2026
f2d7154
Merge remote-tracking branch 'upstream/main' into improve-weight-conv…
yonigozlan Apr 27, 2026
8f726c7
Fix deduplication removes submodel mappings of the same type
yonigozlan Apr 27, 2026
fd2c613
Fix scoped WeightConverter not applied in the correct order, now inte…
yonigozlan Apr 27, 2026
28ed270
temp fix paligemma
yonigozlan Apr 27, 2026
24660f6
Apply _local() to expert biases under EP
AmineDiro Apr 27, 2026
37c106b
Fix import ordering
AmineDiro Apr 27, 2026
c75244e
Fix incompatible mappings between head and base model for VLMs
yonigozlan Apr 28, 2026
dcf9519
glmasr should be in AutoModelForMultimodalLM
eustlb Apr 28, 2026
cb7ba4d
add dia to MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES
eustlb Apr 28, 2026
ba51f15
update revision for Phi-4 model to make it run w/o remote code
kaixuanliu Apr 28, 2026
1174779
update
kaixuanliu Apr 28, 2026
9c712a5
Refactor EP sharding to apply DTensor wrapping during loading
AmineDiro Apr 28, 2026
c2f5df2
Fix shared config mutation issue in flash_attn_from_config
kaixuanliu Apr 28, 2026
1101322
FIX Restore LoRA hotswapping functionality
BenjaminBossan Apr 28, 2026
7d87f95
Merge branch 'main' into main
stationeros Apr 28, 2026
a65c934
Exclude audio modules from conversion process
softguy777 Apr 28, 2026
e999543
update code
kaixuanliu Apr 28, 2026
9ea76fe
fix: Made histc_input robust for broader hardware
rigen1048 Apr 28, 2026
1dbd3da
fix gemma3 mapping
yonigozlan Apr 28, 2026
d8fe583
Merge branch 'main' into main
stationeros Apr 28, 2026
78e2c2d
Merge branch 'main' into OSI
rigen1048 Apr 28, 2026
cf88e4f
Use basename/hash for local trust_remote_code cache paths
Jeevang1-epic Apr 28, 2026
bbc60ad
Merge branch 'main' into main
stationeros Apr 28, 2026
75dad67
Fix lint issues with Ruff
rigen1048 Apr 28, 2026
c9cc099
cb error
SunMarc Apr 28, 2026
30f65e4
Fix custom-module copies inheriting read-only permissions (#45684)
nurpax Apr 28, 2026
b799f32
Fix more issues, address reviews
yonigozlan Apr 28, 2026
a702b56
Add option to override image_processor_auto_map with local code when …
yonigozlan Apr 28, 2026
f4118e2
Merge branch 'main' into fix-phi4-test
yonigozlan Apr 28, 2026
f0eeb8f
Merge branch 'main' into improve-weight-converter
yonigozlan Apr 28, 2026
4bcb04f
Merge branch 'mergeability-pr-45692' into all-defects-750
evalstate Apr 28, 2026
7236eaa
Merge branch 'mergeability-pr-45691' into all-defects-750
evalstate Apr 28, 2026
9fda37a
Merge branch 'mergeability-pr-45687' into all-defects-750
evalstate Apr 28, 2026
453831a
Merge branch 'mergeability-pr-45686' into all-defects-750
evalstate Apr 28, 2026
47edcd1
Merge branch 'mergeability-pr-45683' into all-defects-750
evalstate Apr 28, 2026
8ade0f4
Merge branch 'mergeability-pr-45682' into all-defects-750
evalstate Apr 28, 2026
8a78618
Merge branch 'mergeability-pr-45678' into all-defects-750
evalstate Apr 28, 2026
bb76ed2
Merge branch 'mergeability-pr-45671' into all-defects-750
evalstate Apr 28, 2026
5ba38b9
Merge branch 'mergeability-pr-45670' into all-defects-750
evalstate Apr 28, 2026
286da16
Merge branch 'mergeability-pr-45662' into all-defects-750
evalstate Apr 28, 2026
3eaec5b
Merge branch 'mergeability-pr-45661' into all-defects-750
evalstate Apr 28, 2026
584d284
Merge branch 'mergeability-pr-45649' into all-defects-750
evalstate Apr 28, 2026
0fda7a4
Merge branch 'mergeability-pr-45645' into all-defects-750
evalstate Apr 28, 2026
41be4e1
Merge branch 'mergeability-pr-45642' into all-defects-750
evalstate Apr 28, 2026
73326e2
refactor: Relocate tests
harshaljanjani Apr 29, 2026
936f92c
Fix train_batch_size and eval_batch_size to respect split_batches config
MinuriRajapakse Apr 29, 2026
4d0f5ea
Merge branch 'mergeability-pr-45694' into all-defects-750
evalstate Apr 29, 2026
bb4c2b1
Merge branch 'mergeability-pr-45627' into all-defects-750
evalstate Apr 29, 2026
7d56e34
Merge branch 'mergeability-pr-45615' into all-defects-750
evalstate Apr 29, 2026
a79d37e
fix(testing): check torch.cuda.is_available() before get_device_capab…
PHclaw Apr 29, 2026
6b7af19
Merge branch 'mergeability-pr-45614' into all-defects-750
evalstate Apr 29, 2026
40690cb
Merge branch 'mergeability-pr-45594' into all-defects-750
evalstate Apr 29, 2026
261a099
Merge branch 'mergeability-pr-45591' into all-defects-750
evalstate Apr 29, 2026
d4276fd
Merge branch 'mergeability-pr-45578' into all-defects-750
evalstate Apr 29, 2026
90ad846
Merge branch 'mergeability-pr-45570' into all-defects-750
evalstate Apr 29, 2026
9e51704
Merge branch 'mergeability-pr-45568' into all-defects-750
evalstate Apr 29, 2026
07a4e07
Merge branch 'mergeability-pr-45552' into all-defects-750
evalstate Apr 29, 2026
a91cf72
Merge branch 'mergeability-pr-45549' into all-defects-750
evalstate Apr 29, 2026
a9a88df
Merge branch 'mergeability-pr-45548' into all-defects-750
evalstate Apr 29, 2026
0712093
Merge branch 'mergeability-pr-45541' into all-defects-750
evalstate Apr 29, 2026
3cea3ff
Merge branch 'mergeability-pr-45524' into all-defects-750
evalstate Apr 29, 2026
693e178
Merge branch 'mergeability-pr-45523' into all-defects-750
evalstate Apr 29, 2026
353c382
Merge branch 'mergeability-pr-45487' into all-defects-750
evalstate Apr 29, 2026
bf3b2fc
Merge branch 'mergeability-pr-45423' into all-defects-750
evalstate Apr 29, 2026
8cb099f
Merge branch 'mergeability-pr-45422' into all-defects-750
evalstate Apr 29, 2026
0e524b8
Merge branch 'mergeability-pr-45413' into all-defects-750
evalstate Apr 29, 2026
1a4aa46
Merge branch 'mergeability-pr-45389' into all-defects-750
evalstate Apr 29, 2026
f0de1b3
Merge branch 'mergeability-pr-45379' into all-defects-750
evalstate Apr 29, 2026
917a893
Merge branch 'mergeability-pr-45378' into all-defects-750
evalstate Apr 29, 2026
7b68c9c
Merge branch 'mergeability-pr-45360' into all-defects-750
evalstate Apr 29, 2026
74797e6
Merge branch 'mergeability-pr-45346' into all-defects-750
evalstate Apr 29, 2026
87e7cb9
Merge branch 'mergeability-pr-45342' into all-defects-750
evalstate Apr 29, 2026
998f785
Merge branch 'mergeability-pr-45321' into all-defects-750
evalstate Apr 29, 2026
75512d8
Merge branch 'mergeability-pr-45317' into all-defects-750
evalstate Apr 29, 2026
627aafb
Apply PR #45221 audio video error fix
evalstate Apr 29, 2026
7d4f2ff
Merge branch 'mergeability-pr-45202' into all-defects-750
evalstate Apr 29, 2026
5d13b83
Merge branch 'mergeability-pr-45193' into all-defects-750
evalstate Apr 29, 2026
7240f99
Apply PR #45170: fix pre_layernorm typo
evalstate Apr 29, 2026
da190f0
Apply HQQ support fixes from PR #45147
evalstate Apr 29, 2026
aa9d4d3
Apply future-annotations auto_docstring fix from PR #45128
evalstate Apr 29, 2026
49d1cc4
Apply doctest fixes from PR #45114
evalstate Apr 29, 2026
b2fd525
Apply auto_docstring string-annotation fix from PR #45105
evalstate Apr 29, 2026
5d6cb0a
Merge branch 'mergeability-pr-45086' into all-defects-750
evalstate Apr 29, 2026
2e034ea
Merge branch 'mergeability-pr-45060' into all-defects-750
evalstate Apr 29, 2026
2ad8865
Merge branch 'mergeability-pr-45056' into all-defects-750
evalstate Apr 29, 2026
0d7dccf
Apply Trainer custom model checkpoint config fix (#45055)
evalstate Apr 29, 2026
f424c17
Merge branch 'mergeability-pr-45034' into all-defects-750
evalstate Apr 29, 2026
94bdbb1
Merge branch 'mergeability-pr-45017' into all-defects-750
evalstate Apr 29, 2026
a193191
Merge branch 'mergeability-pr-44981' into all-defects-750
evalstate Apr 29, 2026
841ee16
Merge branch 'mergeability-pr-44973' into all-defects-750
evalstate Apr 29, 2026
8b19118
Merge branch 'mergeability-pr-44958' into all-defects-750
evalstate Apr 29, 2026
a25ba13
Merge branch 'mergeability-pr-44952' into all-defects-750
evalstate Apr 29, 2026
3c7ab21
Merge branch 'mergeability-pr-44940' into all-defects-750
evalstate Apr 29, 2026
ca7bbbd
Merge branch 'mergeability-pr-44923' into all-defects-750
evalstate Apr 29, 2026
141c7ab
Merge branch 'mergeability-pr-44907' into all-defects-750
evalstate Apr 29, 2026
d3000bb
Merge branch 'mergeability-pr-44893' into all-defects-750
evalstate Apr 29, 2026
42395b4
Merge branch 'mergeability-pr-44889' into all-defects-750
evalstate Apr 29, 2026
6c2136c
Merge branch 'mergeability-pr-44836' into all-defects-750
evalstate Apr 29, 2026
0dcce01
Merge branch 'mergeability-pr-44827' into all-defects-750
evalstate Apr 29, 2026
61340a9
Fix `_set_model_specific_special_tokens` to accept list-format `extra…
bensons Mar 17, 2026
7752002
Merge branch 'mergeability-pr-44731' into all-defects-750
evalstate Apr 29, 2026
0651254
fix: torch_float should return float, not int
LincolnBurrows2017 Mar 14, 2026
4b53359
Merge branch 'mergeability-pr-44680' into all-defects-750
evalstate Apr 29, 2026
3e46789
Merge branch 'mergeability-pr-44664' into all-defects-750
evalstate Apr 29, 2026
a7c76bb
Merge branch 'mergeability-pr-44641' into all-defects-750
evalstate Apr 29, 2026
336874b
Merge branch 'mergeability-pr-44626' into all-defects-750
evalstate Apr 29, 2026
d157a9f
Merge branch 'mergeability-pr-44615' into all-defects-750
evalstate Apr 29, 2026
c8bc9ff
Merge branch 'mergeability-pr-44606' into all-defects-750
evalstate Apr 29, 2026
266b04f
Merge branch 'mergeability-pr-44603' into all-defects-750
evalstate Apr 29, 2026
3f4dc09
Merge branch 'mergeability-pr-44587' into all-defects-750
evalstate Apr 29, 2026
5e41492
Merge branch 'mergeability-pr-44585' into all-defects-750
evalstate Apr 29, 2026
8babe48
Merge branch 'mergeability-pr-44385' into all-defects-750
evalstate Apr 29, 2026
c386bfa
Merge branch 'mergeability-pr-44270' into all-defects-750
evalstate Apr 29, 2026
37bfe7e
Merge branch 'mergeability-pr-44257' into all-defects-750
evalstate Apr 29, 2026
e67239e
Merge branch 'mergeability-pr-44228' into all-defects-750
evalstate Apr 29, 2026
10dcfac
Merge branch 'mergeability-pr-44189' into all-defects-750
evalstate Apr 29, 2026
1cdecb7
Merge branch 'mergeability-pr-43989' into all-defects-750
evalstate Apr 29, 2026
5bdff6c
Merge branch 'mergeability-pr-43967' into all-defects-750
evalstate Apr 29, 2026
3b6fea8
Merge branch 'mergeability-pr-43961' into all-defects-750
evalstate Apr 29, 2026
2498545
Merge branch 'mergeability-pr-43911' into all-defects-750
evalstate Apr 29, 2026
9d58077
Merge branch 'mergeability-pr-43875' into all-defects-750
evalstate Apr 29, 2026
f07b909
Merge branch 'mergeability-pr-43833' into all-defects-750
evalstate Apr 29, 2026
08cd0b0
Merge branch 'mergeability-pr-43826' into all-defects-750
evalstate Apr 29, 2026
748d311
Merge branch 'mergeability-pr-43779' into all-defects-750
evalstate Apr 29, 2026
0574327
Merge branch 'mergeability-pr-43775' into all-defects-750
evalstate Apr 29, 2026
834e4f4
Merge branch 'mergeability-pr-43747' into all-defects-750
evalstate Apr 29, 2026
54d8f82
Merge branch 'mergeability-pr-43654' into all-defects-750
evalstate Apr 29, 2026
7d98f17
Merge branch 'mergeability-pr-43651' into all-defects-750
evalstate Apr 29, 2026
3ca1fc4
Merge branch 'mergeability-pr-43549' into all-defects-750
evalstate Apr 29, 2026
a3a5c52
Merge branch 'mergeability-pr-43543' into all-defects-750
evalstate Apr 29, 2026
3371349
Merge branch 'mergeability-pr-43492' into all-defects-750
evalstate Apr 29, 2026
8e2a101
Merge branch 'mergeability-pr-43466' into all-defects-750
evalstate Apr 29, 2026
09e1bd2
Merge branch 'mergeability-pr-43395' into all-defects-750
evalstate Apr 29, 2026
78316ff
Merge branch 'mergeability-pr-43382' into all-defects-750
evalstate Apr 29, 2026
d653ca9
Merge branch 'mergeability-pr-43378' into all-defects-750
evalstate Apr 29, 2026
db7652a
Merge branch 'mergeability-pr-43291' into all-defects-750
evalstate Apr 29, 2026
19445cb
Merge branch 'mergeability-pr-43270' into all-defects-750
evalstate Apr 29, 2026
92a4ab8
Merge branch 'mergeability-pr-43254' into all-defects-750
evalstate Apr 29, 2026
874da4b
Apply PR #43238 object detection batch fix
evalstate Apr 29, 2026
755aeff
Merge branch 'mergeability-pr-43212' into all-defects-750
evalstate Apr 29, 2026
3ed2716
Merge branch 'mergeability-pr-43151' into all-defects-750
evalstate Apr 29, 2026
b98390d
Apply SAM-HQ positional embedding sharing fix
evalstate Apr 29, 2026
256b576
Skip weight conversion when quantizer provides save state
evalstate Apr 29, 2026
8df09cf
Apply ViT BICUBIC default interpolation fix from PR #43028
evalstate Apr 29, 2026
64bd3ce
Merge branch 'mergeability-pr-43015' into all-defects-750
evalstate Apr 29, 2026
19760b2
Merge branch 'mergeability-pr-42979' into all-defects-750
evalstate Apr 29, 2026
56e26eb
Merge branch 'mergeability-pr-42942' into all-defects-750
evalstate Apr 29, 2026
d49826a
Merge branch 'mergeability-pr-42900' into all-defects-750
evalstate Apr 29, 2026
54d5437
Merge branch 'mergeability-pr-42881' into all-defects-750
evalstate Apr 29, 2026
dbf2fd6
Merge branch 'mergeability-pr-42865' into all-defects-750
evalstate Apr 29, 2026
497d85c
Apply MLX BatchFeature tensor conversion from PR #42824
evalstate Apr 29, 2026
9c796c8
Merge branch 'mergeability-pr-42793' into all-defects-750
evalstate Apr 29, 2026
590c45a
Merge branch 'mergeability-pr-42717' into all-defects-750
evalstate Apr 29, 2026
45cdf80
Merge branch 'mergeability-pr-42631' into all-defects-750
evalstate Apr 29, 2026
677c1d1
Merge branch 'mergeability-pr-42598' into all-defects-750
evalstate Apr 29, 2026
c5c59c3
Merge branch 'mergeability-pr-42521' into all-defects-750
evalstate Apr 29, 2026
613c833
Merge branch 'mergeability-pr-42493' into all-defects-750
evalstate Apr 29, 2026
2fd9a4e
Merge branch 'mergeability-pr-42446' into all-defects-750
evalstate Apr 29, 2026
8759c03
Merge branch 'mergeability-pr-42311' into all-defects-750
evalstate Apr 29, 2026
dddf40d
Merge branch 'mergeability-pr-42228' into all-defects-750
evalstate Apr 29, 2026
90f5cd5
Merge branch 'mergeability-pr-42133' into all-defects-750
evalstate Apr 29, 2026
2e8cc24
Merge branch 'mergeability-pr-42127' into all-defects-750
evalstate Apr 29, 2026
5ec95c0
Merge branch 'mergeability-pr-42098' into all-defects-750
evalstate Apr 29, 2026
787693c
Merge branch 'mergeability-pr-42051' into all-defects-750
evalstate Apr 29, 2026
fb3f1fb
Merge branch 'mergeability-pr-41973' into all-defects-750
evalstate Apr 29, 2026
22ceea3
Merge branch 'mergeability-pr-41928' into all-defects-750
evalstate Apr 29, 2026
9892b6d
Apply PR #41904 loss averaging fix
evalstate Apr 29, 2026
1524f7b
Merge branch 'mergeability-pr-41901' into all-defects-750
evalstate Apr 29, 2026
8d16856
Apply PR #41844 FSDPv2 TPU checkpoint unwrap fix
evalstate Apr 29, 2026
3e60b11
Apply PR #41827 FlashAttention compile guard
evalstate Apr 29, 2026
797464c
Apply PR #41754 cache pytree registration
evalstate Apr 29, 2026
ce06fb9
Merge branch 'mergeability-pr-41734' into all-defects-750
evalstate Apr 29, 2026
6ec76f7
Merge branch 'mergeability-pr-41724' into all-defects-750
evalstate Apr 29, 2026
284cd4c
Merge branch 'mergeability-pr-41721' into all-defects-750
evalstate Apr 29, 2026
3e44ff0
Merge branch 'mergeability-pr-41701' into all-defects-750
evalstate Apr 29, 2026
dafb36c
Merge branch 'mergeability-pr-41698' into all-defects-750
evalstate Apr 29, 2026
333839c
Merge branch 'mergeability-pr-41687' into all-defects-750
evalstate Apr 29, 2026
324617d
Merge branch 'mergeability-pr-41521' into all-defects-750
evalstate Apr 29, 2026
4b661c1
Apply SmolVLM quantization dtype fix from PR #41485
evalstate Apr 29, 2026
04bb5c7
Merge branch 'mergeability-pr-41441' into all-defects-750
evalstate Apr 29, 2026
c04f9f2
Merge branch 'mergeability-pr-41330' into all-defects-750
evalstate Apr 29, 2026
f42311c
Merge branch 'mergeability-pr-41313' into all-defects-750
evalstate Apr 29, 2026
0ae6fe3
Merge branch 'mergeability-pr-41169' into all-defects-750
evalstate Apr 29, 2026
5eaf62d
fix(SpeechT5Config): missing @property annotation
sw00 Sep 24, 2025
f071007
fix: Resolve unexpected video frame dropping for multi-video inputs
WesKwong Sep 24, 2025
5767e04
fix: update video output length calculation
WesKwong Sep 24, 2025
8c04115
Fix torch neuroncore availability check
evalstate Apr 29, 2026
ab9e694
Delay flash attention unpad index materialization
evalstate Apr 29, 2026
aba1bb7
Merge branch 'mergeability-pr-41077' into all-defects-750
evalstate Apr 29, 2026
aa92e98
Fix Qwen3 deterministic generation when do_sample=False
Flakes342 Sep 22, 2025
c70a1c5
Fix Qwen3 deterministic generation when do_sample=False
Flakes342 Sep 22, 2025
16f7f3a
Iamashamed
Flakes342 Sep 22, 2025
d6ef47b
Merge branch 'mergeability-pr-40908' into all-defects-750
evalstate Apr 29, 2026
1f2fd45
Apply PR #40790 checkpoint resume handling
evalstate Apr 29, 2026
edd6e3d
Merge branch 'mergeability-pr-40783' into all-defects-750
evalstate Apr 29, 2026
201e995
Merge branch 'mergeability-pr-40666' into all-defects-750
evalstate Apr 29, 2026
c0e181c
Apply PR #40492 divide-by-zero guards
evalstate Apr 29, 2026
ad6ce8a
Apply PR #40438 label_names fallback
evalstate Apr 29, 2026
c03d8e9
Merge branch 'mergeability-pr-40392' into all-defects-750
evalstate Apr 29, 2026
328ee57
Merge branch 'mergeability-pr-40385' into all-defects-750
evalstate Apr 29, 2026
3eda195
Apply PR #40358 MXFP4 MLP shape fix
evalstate Apr 29, 2026
d6fd902
Apply PR #40208 FSDP save_only_model sharded state fix
evalstate Apr 29, 2026
b9f367e
Merge branch 'mergeability-pr-40148' into all-defects-750
evalstate Apr 29, 2026
66137a3
Apply Mixtral torch.export expert loop fix (#40114)
evalstate Apr 29, 2026
0402db0
Merge branch 'mergeability-pr-40090' into all-defects-750
evalstate Apr 29, 2026
9b016f5
Merge branch 'mergeability-pr-40065' into all-defects-750
evalstate Apr 29, 2026
f375ccf
Merge branch 'mergeability-pr-40059' into all-defects-750
evalstate Apr 29, 2026
37c36d6
Merge branch 'mergeability-pr-40022' into all-defects-750
evalstate Apr 29, 2026
753cdc7
Apply PR 39999 tensor parallel meta device-map fix
evalstate Apr 29, 2026
90242c2
Merge branch 'mergeability-pr-39997' into all-defects-750
evalstate Apr 29, 2026
254ff61
Merge branch 'mergeability-pr-39866' into all-defects-750
evalstate Apr 29, 2026
cd54479
Apply PR #39794 fix for ProphetNet tuple encoder outputs
evalstate Apr 29, 2026
856779d
Merge branch 'mergeability-pr-39793' into all-defects-750
evalstate Apr 29, 2026
b883fe4
Merge branch 'mergeability-pr-39741' into all-defects-750
evalstate Apr 29, 2026
6267345
Apply PR #39698 Exaone4 sliding window pattern fix
evalstate Apr 29, 2026
4edb60c
Merge branch 'mergeability-pr-39697' into all-defects-750
evalstate Apr 29, 2026
d4d2a70
Apply PR #39683 respect dynamo disable env
evalstate Apr 29, 2026
534f37c
Apply PR #39674 scale loss by data parallel size
evalstate Apr 29, 2026
f8340cd
Apply PR #39599: guard missing TrainerState on resume
evalstate Apr 29, 2026
a826098
Apply PR #39560: save best model on eval
evalstate Apr 29, 2026
91dc36c
Merge branch 'mergeability-pr-39493' into all-defects-750
evalstate Apr 29, 2026
050c5b7
Merge branch 'mergeability-pr-39491' into all-defects-750
evalstate Apr 29, 2026
ab69093
Apply quantized dispatch fix from PR 39468
evalstate Apr 29, 2026
7675317
Merge branch 'mergeability-pr-39257' into all-defects-750
evalstate Apr 29, 2026
a83d5b3
Apply PR 39206: handle empty Qwen3 MoE router logits
evalstate Apr 29, 2026
cf3dd54
Apply PR 39103 Gemma3n audio token config rename
evalstate Apr 29, 2026
fe5e536
Apply PR 39046 DETR max_size handling
evalstate Apr 29, 2026
f827509
Apply PR 39037 Kosmos2 attention causality fix
evalstate Apr 29, 2026
2143f93
Merge branch 'mergeability-pr-38888' into all-defects-750
evalstate Apr 29, 2026
13 changes: 12 additions & 1 deletion .github/workflows/model_jobs.yml
@@ -186,7 +186,18 @@ jobs:
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"
shopt -s nullglob
captured_info_files=("/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"/captured_info*.txt)

if [ ${#captured_info_files[@]} -eq 0 ]; then
echo "No captured information files found."
exit 0
fi

for captured_info_file in "${captured_info_files[@]}"; do
echo "===== ${captured_info_file##*/} ====="
cat "$captured_info_file"
done

- name: Copy test_outputs.txt
if: ${{ always() }}
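The workflow step above enables `nullglob` so that a missing report file no longer fails the job, then prints each xdist worker's capture under a banner. The same logic can be sketched in Python (the directory layout here is an illustrative assumption, not the real CI path):

```python
from pathlib import Path
import tempfile

def print_captured_info(report_dir: str) -> int:
    """Print every captured_info*.txt under report_dir; return how many were found."""
    files = sorted(Path(report_dir).glob("captured_info*.txt"))
    if not files:
        # mirrors the workflow's graceful exit when no worker wrote a capture
        print("No captured information files found.")
        return 0
    for f in files:
        print(f"===== {f.name} =====")  # banner, like `${captured_info_file##*/}`
        print(f.read_text())
    return len(files)

# Illustrative usage: a temp dir stands in for the CI reports directory
with tempfile.TemporaryDirectory() as reports:
    (Path(reports) / "captured_info_gw0.txt").write_text("worker 0 output")
    (Path(reports) / "captured_info_gw1.txt").write_text("worker 1 output")
    assert print_captured_info(reports) == 2
```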
98 changes: 98 additions & 0 deletions all_requirements.txt
@@ -0,0 +1,98 @@
gpustat==1.1.1
psutil==6.0.0
psycopg2==2.9.9
pandas>=1.5.0
numpy>=1.21.0
psutil>=5.8.0
nvidia-ml-py>=12.0.0
torch>=2.0.0
datasets>=2.10.0
huggingface_hub>=0.16.0
amdsmi>=7.0.2
git+https://github.com/huggingface/transformers.git@main # install main, or adjust it with vX.X.X to install a specific transformers version
datasets==1.8.0
accelerate >= 0.12.0
datasets >= 1.8.0
torch >= 1.3.0
evaluate
accelerate >= 0.21.0
sentencepiece != 0.1.92
protobuf
torch >= 1.3
datasets[audio]>=1.14.0
evaluate
librosa
torchaudio
torch>=1.6
accelerate >= 0.12.0
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
sacrebleu >= 1.4.12
py7zr
torch >= 1.3
evaluate
datasets >= 2.0.0
torch >= 1.3
accelerate
evaluate
Pillow
albumentations >= 1.4.16
accelerate >= 0.12.0
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
rouge-score
nltk
py7zr
torch >= 1.3
evaluate
torch>=1.5.0
torchvision>=0.6.0
datasets>=1.8.0
accelerate >= 0.12.0
datasets >= 1.8.0
sentencepiece != 0.1.92
scipy
scikit-learn
protobuf
torch >= 1.3
evaluate
accelerate>=0.12.0
torch>=1.5.0
torchvision>=0.6.0
datasets>=2.14.0
evaluate
scikit-learn
accelerate >= 0.12.0
torch >= 1.3
datasets >= 2.14.0
sentencepiece != 0.1.92
protobuf
evaluate
scikit-learn
accelerate >= 0.12.0
seqeval
datasets >= 1.8.0
torch >= 1.3
evaluate
albumentations >= 1.4.16
timm
datasets>=4.0
torchmetrics
pycocotools
datasets[audio] >= 1.18.0
torch >= 1.5
torchaudio
librosa
jiwer
evaluate
datasets[audio] >= 1.12.0
torch >= 1.5
torchaudio
accelerate >= 0.12.0
librosa
torch>=1.5.0
torchvision>=0.6.0
datasets>=1.8.0
albumentations >= 1.4.16
timm
datasets
torchmetrics
pycocotools
accelerate >= 0.12.0
sentencepiece != 0.1.92
protobuf
torch >= 1.3
evaluate
19 changes: 17 additions & 2 deletions docker/transformers-all-latest-gpu/Dockerfile
@@ -18,9 +18,20 @@ ARG TORCHCODEC='0.11.0'

ARG FLASH_ATTN='false'

# 'x86_64' or 'arm64'
ARG ARCHITECTURE='x86_64'

RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs curl
RUN git lfs install

RUN set -e; \
if [ "$ARCHITECTURE" = "arm64" ]; then \
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y;\
PATH="/root/.cargo/bin:${PATH}";\
rustc --version;\
fi;

RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=main
@@ -36,7 +47,11 @@ RUN set -e; \
# Determine torch version
if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \
VERSION="torch==${PYTORCH}.*"; \
TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \
if [ "$ARCHITECTURE" = "arm64" ]; then \
TORCHCODEC_VERSION="torchcodec"; \
else \
TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \
fi; \
else \
VERSION="torch"; \
TORCHCODEC_VERSION="torchcodec"; \
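The Dockerfile change pins `torchcodec` only on x86_64 and falls back to the unpinned package on arm64, where the pinned wheel series may not be published. That selection can be sketched as (the version string comes from the diff; the helper itself is hypothetical):

```python
def torchcodec_spec(architecture: str, pinned: str = "0.11.0") -> str:
    """Pick a pip requirement string mirroring the Dockerfile branch."""
    if architecture == "arm64":
        return "torchcodec"           # no pinned wheel assumed; take latest
    return f"torchcodec=={pinned}.*"  # pin to the tested patch series

assert torchcodec_spec("x86_64") == "torchcodec==0.11.0.*"
assert torchcodec_spec("arm64") == "torchcodec"
```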
8 changes: 4 additions & 4 deletions docs/source/en/auto_docstring.md
@@ -134,11 +134,11 @@ class MyModelConfig(PreTrainedConfig):
Description of another model-specific parameter.

```python
>>> from transformers import MyModelConfig, MyModel
from transformers import MyModelConfig, MyModel

>>> configuration = MyModelConfig()
>>> model = MyModel(configuration)
>>> configuration = model.config
configuration = MyModelConfig()
model = MyModel(configuration)
configuration = model.config
```
"""

14 changes: 10 additions & 4 deletions docs/source/en/internal/import_utils.md
@@ -29,18 +29,24 @@ This object is still importable:

```python
>>> from transformers import DetrImageProcessor
>>> print(DetrImageProcessor)
<class 'DetrImageProcessor'>
>>> print(DetrImageProcessor) # doctest: +ELLIPSIS
<class '...DetrImageProcessor'>
```

However, no method can be called on that object:

```python
>>> from transformers.utils.import_utils import BACKENDS_MAPPING, DummyObject
>>> _torchvision_backend = BACKENDS_MAPPING["torchvision"]
>>> BACKENDS_MAPPING["torchvision"] = (lambda: False, _torchvision_backend[1].lstrip("\n"))
>>> DetrImageProcessor = DummyObject("DetrImageProcessor", (), {"_backends": ["torchvision"]})
>>> DetrImageProcessor.from_pretrained()
ImportError:
DetrImageProcessor requires the Torchvision library but it was not found in your environment. Check out the instructions on the
Traceback (most recent call last):
...
ImportError: DetrImageProcessor requires the Torchvision library but it was not found in your environment. Check out the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
>>> BACKENDS_MAPPING["torchvision"] = _torchvision_backend
```

Let's see how to specify specific object dependencies.
1 change: 1 addition & 0 deletions docs/source/en/main_classes/pipelines.md
@@ -34,6 +34,7 @@ pipeline but can provide additional quality of life.
Simple call on one item:

```python
>>> from transformers import pipeline
>>> pipe = pipeline("text-classification")
>>> pipe("This restaurant is awesome")
[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
42 changes: 41 additions & 1 deletion docs/source/en/model_doc/pe_audio_video.md
@@ -26,7 +26,47 @@ TODO
### Basic usage

```py
TODO

model = PeAudioVideoModel.from_pretrained("facebook/pe-av-large", device_map="cuda", dtype=torch.bfloat16)
processor = PeAudioVideoProcessor.from_pretrained("facebook/pe-av-large")

from huggingface_hub import hf_hub_download

video_path = hf_hub_download(
repo_id="eustlb/dummy-video-dataset", filename="audiobox.mp4", repo_type="dataset"
)

video_path2 = hf_hub_download(
repo_id="eustlb/dummy-video-dataset", filename="glass_breaking.mp4", repo_type="dataset"
)

audio_path = hf_hub_download(
repo_id="eustlb/dummy-video-dataset", filename="audiobox.mp4", repo_type="dataset"
)

audio_path2 = hf_hub_download(
repo_id="eustlb/dummy-video-dataset", filename="glass_breaking.mp4", repo_type="dataset"
)

video_files = [video_path, video_path2]
descriptions = ["A woman and a man speaking", "A glass breaking"]
audio_files = [audio_path, audio_path2]

inputs = processor(
videos=video_files, text=descriptions, audio=audio_files, return_tensors="pt", padding=True
)

with torch.inference_mode(), torch.autocast(model.device.type, dtype=torch.bfloat16):
outputs = model(**inputs.to(model.device, dtype=model.dtype))

audio_embeds = outputs.audio_embeds # Audio-only embeddings
video_embeds = outputs.video_embeds # Video-only embeddings
audio_video_embeds = outputs.audio_video_embeds # Joint audio-video embeddings
text_audio_embeds = outputs.text_audio_embeds # Text embeddings aligned to audio
text_video_embeds = outputs.text_video_embeds # Text embeddings aligned to video
text_audio_video_embeds = outputs.text_audio_video_embeds # Text embeddings aligned to audio-video
audio_plus_text_embeds = outputs.audio_plus_text_embeds # Joint audio and text embedding
video_plus_text_embeds = outputs.video_plus_text_embeds # Joint video and text embedding
```

## PeAudioVideoProcessor
9 changes: 7 additions & 2 deletions docs/source/en/model_doc/qwen3_5.md
@@ -70,14 +70,19 @@ TODO
[[autodoc]] Qwen3_5ForCausalLM
- forward

## Qwen3_5ForConditionalGeneration

[[autodoc]] Qwen3_5ForConditionalGeneration
- forward

## Qwen3_5ForSequenceClassification

[[autodoc]] Qwen3_5ForSequenceClassification
- forward

## Qwen3_5ForConditionalGeneration
## Qwen3_5TextForSequenceClassification

[[autodoc]] Qwen3_5ForConditionalGeneration
[[autodoc]] Qwen3_5TextForSequenceClassification
- forward

## Qwen3_5Tokenizer
3 changes: 1 addition & 2 deletions docs/source/en/tasks/zero_shot_object_detection.md
@@ -168,8 +168,7 @@ boxes have the correct coordinates relative to the original image:
... outputs = model(**inputs)

>>> results = processor.post_process_grounded_object_detection(
... outputs, threshold=0.50, target_sizes=[(image.height, image.width)], text_labels=text_labels,
... )[0]
... outputs, threshold=0.50, target_sizes=[(image.height, image.width)], text_labels=text_labels)[0]

>>> draw = ImageDraw.Draw(image)

10 changes: 4 additions & 6 deletions examples/modular-transformers/modeling_new_task_model.py
@@ -160,10 +160,8 @@ def create_causal_mask_mapping(
# from `forward` call. If users run a `forward` call, we have no option to infer `is_first_iteration` because users may be
# running generation with custom loop. Thus we need to infer it in a `non-perfect` way
# NOTE: Determining prefill in that case requires checking data values, which is not compile-compatible.
is_first_iteration = (
is_first_iteration
if is_first_iteration
else (past_key_values is None or not past_key_values.is_initialized or pixel_values is not None)
is_first_iteration = is_first_iteration or (
past_key_values is None or not past_key_values.is_initialized or pixel_values is not None
)

if is_first_iteration or not kwargs.get("use_cache", True):
@@ -256,9 +254,9 @@ def get_placeholder_mask(

n_image_tokens = special_image_mask.sum()
n_image_features = image_features.shape[0] * image_features.shape[1]
special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
special_image_mask = special_image_mask.unsqueeze(-1).to(inputs_embeds.device)
torch_compilable_check(
inputs_embeds[special_image_mask].numel() == image_features.numel(),
n_image_tokens * inputs_embeds.shape[-1] == image_features.numel(),
f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
)
return special_image_mask
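The first hunk above collapses `x if x else y` into `x or y`; for the boolean flag involved the two forms agree, which a tiny sketch (with hypothetical names) can verify exhaustively:

```python
from itertools import product

def conditional_form(flag: bool, fallback: bool) -> bool:
    # original pattern from the diff: `flag if flag else fallback`
    return flag if flag else fallback

def or_form(flag: bool, fallback: bool) -> bool:
    # simplified pattern from the diff: `flag or fallback`
    return flag or fallback

# Exhaustively check that all four boolean combinations agree
for flag, fallback in product([False, True], repeat=2):
    assert conditional_form(flag, fallback) == or_form(flag, fallback)
```

Note the equivalence holds only because the guard is a boolean; for general truthy values the two forms can differ in which object they return.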
2 changes: 1 addition & 1 deletion examples/pytorch/image-pretraining/run_mim_no_trainer.py
@@ -633,7 +633,7 @@ def preprocess_images(examples):
)

# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
if accelerator.distributed_type == DistributedType.TPU:
if accelerator.distributed_type == DistributedType.XLA:
model.tie_weights()

# We need to recalculate our total training steps as the size of the training dataloader may have changed.
Expand Down
12 changes: 8 additions & 4 deletions examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -553,7 +553,7 @@ def group_texts(examples):
)

# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
if accelerator.distributed_type == DistributedType.TPU:
if accelerator.distributed_type == DistributedType.XLA:
model.tie_weights()

# We need to recalculate our total training steps as the size of the training dataloader may have changed.
@@ -627,6 +627,7 @@ def group_texts(examples):
model.train()
if args.with_tracking:
total_loss = 0
total_samples = 0
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
@@ -638,7 +639,9 @@ def group_texts(examples):
loss = outputs.loss
# We keep track of the loss at each epoch
if args.with_tracking:
total_loss += loss.detach().float()
batch_size = batch["input_ids"].shape[0]
total_loss += loss.detach().float() * batch_size
total_samples += batch_size
accelerator.backward(loss)
optimizer.step()
lr_scheduler.step()
@@ -665,7 +668,8 @@ def group_texts(examples):
outputs = model(**batch)

loss = outputs.loss
losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
batch_size = batch["input_ids"].shape[0]
losses.append(accelerator.gather_for_metrics(loss.repeat(batch_size)))

losses = torch.cat(losses)
try:
@@ -681,7 +685,7 @@ def group_texts(examples):
{
"perplexity": perplexity,
"eval_loss": eval_loss,
"train_loss": total_loss.item() / len(train_dataloader),
"train_loss": total_loss.item() / total_samples,
"epoch": epoch,
"step": completed_steps,
},
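The hunks above weight each batch's loss by its size before averaging, because dividing an unweighted sum of per-batch means by the number of batches over-weights a smaller final batch. A minimal sketch with made-up numbers:

```python
# Two batches: 4 samples with mean loss 2.0, then 1 sample with loss 8.0
batch_losses = [(2.0, 4), (8.0, 1)]  # (mean loss, batch size)

# Old behavior: average of per-batch means, ignoring batch size
naive = sum(loss for loss, _ in batch_losses) / len(batch_losses)

# New behavior: weight each mean by its batch size, divide by total samples
total_loss = sum(loss * n for loss, n in batch_losses)
total_samples = sum(n for _, n in batch_losses)
weighted = total_loss / total_samples

assert naive == 5.0          # over-weights the single-sample batch
assert weighted == 16.0 / 5  # true per-sample mean loss: 3.2
```

The same reasoning motivates `loss.repeat(batch_size)` in the eval loop: repeating by the actual batch size keeps `gather_for_metrics` per-sample even when the last batch is short.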