From 8af3af13abb85ca60e795d0390832f398a56c34f Mon Sep 17 00:00:00 2001 From: Manuel de Prada Corral Date: Tue, 26 Aug 2025 11:55:45 +0200 Subject: [PATCH 1/5] Squashed commit remove-constrastive-search --- docs/source/en/generation_strategies.md | 23 - docs/source/ja/generation_strategies.md | 23 - docs/source/ko/generation_strategies.md | 23 +- .../run_generation_contrastive_search.py | 146 ----- src/transformers/cache_utils.py | 6 +- .../generation/configuration_utils.py | 21 +- src/transformers/generation/utils.py | 501 +----------------- tests/generation/test_configuration_utils.py | 1 + tests/generation/test_utils.py | 315 +---------- tests/models/bart/test_modeling_bart.py | 11 +- tests/models/csm/test_modeling_csm.py | 15 - tests/models/gemma/test_modeling_gemma.py | 1 + tests/models/gpt2/test_modeling_gpt2.py | 10 +- .../gpt_bigcode/test_modeling_gpt_bigcode.py | 8 - tests/models/gptj/test_modeling_gptj.py | 10 +- tests/models/idefics/test_modeling_idefics.py | 12 - .../models/idefics2/test_modeling_idefics2.py | 12 - .../models/idefics3/test_modeling_idefics3.py | 12 - .../kosmos2_5/test_modeling_kosmos2_5.py | 8 - tests/models/lfm2/test_modeling_lfm2.py | 12 - tests/models/llama/test_modeling_llama.py | 9 +- tests/models/mistral/test_modeling_mistral.py | 1 + tests/models/opt/test_modeling_opt.py | 10 +- .../paligemma2/test_modeling_paligemma2.py | 4 - tests/models/smolvlm/test_modeling_smolvlm.py | 12 - tests/models/t5/test_modeling_t5.py | 10 +- 26 files changed, 107 insertions(+), 1109 deletions(-) delete mode 100755 examples/pytorch/text-generation/run_generation_contrastive_search.py diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 37c90ee43fa5..5c7d27192292 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -225,29 +225,6 @@ outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=to tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` - -### Contrastive search - -[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied. - -Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return. - -```py -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device - -device = infer_device() - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") -inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device) - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device) -# explicitly set to 100 because Llama2 generation length is 4096 -outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4) -tokenizer.batch_decode(outputs, skip_special_tokens=True) -'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. 
The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has' -``` - ### Diverse beam search [Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups. diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md index a93ef3d36440..856c4856c52f 100644 --- a/docs/source/ja/generation_strategies.md +++ b/docs/source/ja/generation_strategies.md @@ -168,29 +168,6 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te ['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n'] ``` -### Contrastive search - -コントラスティブ検索デコーディング戦略は、2022年の論文[A Contrastive Framework for Neural Text Generation](https://huggingface.co/papers/2202.06417)で提案されました。 -これは、非反復的でありながら一貫性のある長い出力を生成するために優れた結果を示しています。コントラスティブ検索の動作原理を学ぶには、[このブログポスト](https://huggingface.co/blog/introducing-csearch)をご覧ください。 -コントラスティブ検索の動作を有効にし、制御する2つの主要なパラメータは「penalty_alpha」と「top_k」です: - -```python ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> checkpoint = "openai-community/gpt2-large" ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) - ->>> prompt = "Hugging Face Company is" ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=100) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Hugging Face Company is a family owned and operated business. We pride ourselves on being the best -in the business and our customer service is second to none.\n\nIf you have any questions about our -products or services, feel free to contact us at any time. We look forward to hearing from you!'] -``` - ### Multinomial sampling 常に最高確率のトークンを次のトークンとして選択する貪欲検索とは異なり、多項分布サンプリング(または祖先サンプリングとも呼ばれます)はモデルによって提供される語彙全体の確率分布に基づいて次のトークンをランダムに選択します。ゼロ以外の確率を持つすべてのトークンには選択される可能性があり、これにより繰り返しのリスクが減少します。 diff --git a/docs/source/ko/generation_strategies.md b/docs/source/ko/generation_strategies.md index f45fea5b2280..da38e4f418f2 100644 --- a/docs/source/ko/generation_strategies.md +++ b/docs/source/ko/generation_strategies.md @@ -68,7 +68,7 @@ GenerationConfig { - `max_new_tokens`: 생성할 최대 토큰 수입니다. 즉, 프롬프트에 있는 토큰을 제외한 출력 시퀀스의 크기입니다. 출력의 길이를 중단 기준으로 사용하는 대신, 전체 생성물이 일정 시간을 초과할 때 생성을 중단하기로 선택할 수도 있습니다. 더 알아보려면 [`StoppingCriteria`]를 확인하세요. - `num_beams`: 1보다 큰 수의 빔을 지정함으로써, 탐욕 탐색(greedy search)에서 빔 탐색(beam search)으로 전환하게 됩니다. 이 전략은 각 시간 단계에서 여러 가설을 평가하고 결국 전체 시퀀스에 대해 가장 높은 확률을 가진 가설을 선택합니다. 이는 초기 토큰의 확률이 낮아 탐욕 탐색에 의해 무시되었을 높은 확률의 시퀀스를 식별할 수 있는 장점을 가집니다. - `do_sample`: 이 매개변수를 `True`로 설정하면, 다항 샘플링, 빔 탐색 다항 샘플링, Top-K 샘플링 및 Top-p 샘플링과 같은 디코딩 전략을 활성화합니다. 이러한 전략들은 전체 어휘에 대한 확률 분포에서 다음 토큰을 선택하며, 전략별로 특정 조정이 적용됩니다. -- `num_return_sequences`: 각 입력에 대해 반환할 시퀀스 후보의 수입니다. 이 옵션은 빔 탐색(beam search)의 변형과 샘플링과 같이 여러 시퀀스 후보를 지원하는 디코딩 전략에만 사용할 수 있습니다. 탐욕 탐색(greedy search)과 대조 탐색(contrastive search) 같은 디코딩 전략은 단일 출력 시퀀스를 반환합니다. +- `num_return_sequences`: 각 입력에 대해 반환할 시퀀스 후보의 수입니다. 이 옵션은 빔 탐색(beam search)의 변형과 샘플링과 같이 여러 시퀀스 후보를 지원하는 디코딩 전략에만 사용할 수 있습니다. 
탐욕 탐색(greedy search) 같은 디코딩 전략은 단일 출력 시퀀스를 반환합니다. ## 모델에 사용자 정의 디코딩 전략 저장[[save-a-custom-decoding-strategy-with-your-model]] @@ -165,27 +165,6 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te ['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n'] ``` -### 대조 탐색(Contrastive search)[[contrastive-search]] - -2022년 논문 [A Contrastive Framework for Neural Text Generation](https://huggingface.co/papers/2202.06417)에서 제안된 대조 탐색 디코딩 전략은 반복되지 않으면서도 일관된 긴 출력을 생성하는 데 있어 우수한 결과를 보였습니다. 대조 탐색이 작동하는 방식을 알아보려면 [이 블로그 포스트](https://huggingface.co/blog/introducing-csearch)를 확인하세요. 대조 탐색의 동작을 가능하게 하고 제어하는 두 가지 주요 매개변수는 `penalty_alpha`와 `top_k`입니다: - -```python ->>> from transformers import AutoTokenizer, AutoModelForCausalLM - ->>> checkpoint = "openai-community/gpt2-large" ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> model = AutoModelForCausalLM.from_pretrained(checkpoint) - ->>> prompt = "Hugging Face Company is" ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=100) ->>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Hugging Face Company is a family owned and operated business. We pride ourselves on being the best -in the business and our customer service is second to none.\n\nIf you have any questions about our -products or services, feel free to contact us at any time. We look forward to hearing from you!'] -``` - ### 다항 샘플링(Multinomial sampling)[[multinomial-sampling]] 탐욕 탐색(greedy search)이 항상 가장 높은 확률을 가진 토큰을 다음 토큰으로 선택하는 것과 달리, 다항 샘플링(multinomial sampling, 조상 샘플링(ancestral sampling)이라고도 함)은 모델이 제공하는 전체 어휘에 대한 확률 분포를 기반으로 다음 토큰을 무작위로 선택합니다. 0이 아닌 확률을 가진 모든 토큰은 선택될 기회가 있으므로, 반복의 위험을 줄일 수 있습니다. diff --git a/examples/pytorch/text-generation/run_generation_contrastive_search.py b/examples/pytorch/text-generation/run_generation_contrastive_search.py deleted file mode 100755 index 879229c062e3..000000000000 --- a/examples/pytorch/text-generation/run_generation_contrastive_search.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -# Copyright 2022 University of Cambridge, Tencent AI Lab, DeepMind and The University of Hong Kong Authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# /// script -# dependencies = [ -# "transformers @ git+https://github.com/huggingface/transformers.git", -# "accelerate >= 0.21.0", -# "sentencepiece != 0.1.92", -# "protobuf", -# "torch >= 1.3", -# ] -# /// - -"""The examples of running contrastive search on the auto-APIs; - -Running this example: -python run_generation_contrastive_search.py --model_name_or_path=openai-community/gpt2-large --penalty_alpha=0.6 --k=4 --length=256 -""" - -import argparse -import logging - -from accelerate import PartialState -from accelerate.utils import set_seed - -from transformers import AutoModelForCausalLM, AutoTokenizer - - -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, -) -logger = logging.getLogger(__name__) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - ) - parser.add_argument("--prompt", type=str, default="") - parser.add_argument("--length", type=int, default=20) - parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - parser.add_argument( - "--temperature", - type=float, - default=1.0, - help="temperature of 1.0 has no effect, lower tend toward greedy sampling", - ) - parser.add_argument( - "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" - ) - parser.add_argument("--k", type=int, default=0) - parser.add_argument("--penalty_alpha", type=float, default=0.0) - parser.add_argument("--p", type=float, default=0.9) - - parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.") - parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.") - parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") - - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument( - "--use_cpu", - action="store_true", - help="Whether or not to use cpu. If set to False, we will use gpu/npu or mps device if available", - ) - parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", - ) - args = parser.parse_args() - - # Initialize the distributed state. 
- distributed_state = PartialState(cpu=args.use_cpu) - - logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}") - - if args.seed is not None: - set_seed(args.seed) - - # Initialize the model and tokenizer - tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path) - - # tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path) - # model = OPTForCausalLM.from_pretrained(args.model_name_or_path) - # Set the model to the right device - model.to(distributed_state.device) - - if args.fp16: - model.half() - - logger.info(args) - prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") - - inputs = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False) - inputs = {key: value.to(distributed_state.device) for key, value in inputs.items()} - - output_sequences = model.generate( - **inputs, - max_length=args.length + len(inputs["input_ids"][0]), - penalty_alpha=args.penalty_alpha, - top_k=args.k, - ) - - generated_sequences = [] - for generated_sequence_idx, generated_sequence in enumerate(output_sequences): - print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===") - generated_sequence = generated_sequence.tolist() - - # Decode text - text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, add_special_tokens=False) - - # Remove all text after the stop token - text = text[: text.find(args.stop_token) if args.stop_token else None] - - # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing - total_sequence = ( - prompt_text + text[len(tokenizer.decode(inputs["input_ids"][0], clean_up_tokenization_spaces=True)) :] - ) - - generated_sequences.append(total_sequence) - print(total_sequence) - - return generated_sequences - - -if __name__ == "__main__": - main() diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index d1e1b441f67f..e1e38fc76da6 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -1358,7 +1358,7 @@ def check_dynamic_cache(self, method: str): def crop(self, maximum_length: int): """ Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be - negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search. + negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search (on the Hub). """ self.check_dynamic_cache(self.crop.__name__) self.self_attention_cache.crop(maximum_length) @@ -1378,13 +1378,13 @@ def batch_split(self, full_batch_size: int, split_size: int) -> "list[EncoderDec return out def batch_repeat_interleave(self, repeats: int): - """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search.""" + """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search (on the Hub).""" self.check_dynamic_cache(self.batch_repeat_interleave.__name__) self.self_attention_cache.batch_repeat_interleave(repeats) self.cross_attention_cache.batch_repeat_interleave(repeats) def batch_select_indices(self, indices: torch.Tensor): - """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search.""" + """Only keep the `indices` in the batch dimension of the cache. 
Used in contrastive search (on the Hub).""" self.check_dynamic_cache(self.batch_select_indices.__name__) self.self_attention_cache.batch_select_indices(indices) self.cross_attention_cache.batch_select_indices(indices) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 82332b9e7809..1edaf19948e8 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -44,7 +44,7 @@ logger = logging.get_logger(__name__) METADATA_FIELDS = ("_from_model_config", "_commit_hash", "_original_object_hash", "transformers_version") STATIC_CACHE_IMPLEMENTATIONS = ("static", "offloaded_static") -DYNAMIC_CACHE_IMPLEMENTATIONS = ("dynamic", "offloaded", "quantized") +DYNAMIC_CACHE_IMPLEMENTATIONS = ("dynamic", "dynamic_full", "offloaded", "quantized") # All the following are redundant and deprecated, but kept for BC DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS = ( "sliding_window", @@ -86,7 +86,6 @@ class GenerationConfig(PushToHubMixin): for text-decoder, text-to-text, speech-to-text, and vision-to-text models: - *greedy decoding* if `num_beams=1` and `do_sample=False` - - *contrastive search* if `penalty_alpha>0.` and `top_k>1` - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` @@ -138,8 +137,6 @@ class GenerationConfig(PushToHubMixin): num_beam_groups (`int`, *optional*, defaults to 1): Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://huggingface.co/papers/1610.02424) for more details. - penalty_alpha (`float`, *optional*): - The values balance the model confidence and the degeneration penalty in contrastive search decoding. > Parameters that control the cache @@ -255,9 +252,6 @@ class GenerationConfig(PushToHubMixin): The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages the model to generate samples that are more closely linked to the input prompt, usually at the expense of poorer quality. - low_memory (`bool`, *optional*): - Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory. - Used with beam search and contrastive search. watermarking_config (`BaseWatermarkingConfig` or `dict`, *optional*): Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" tokens. 
See the docs of [`SynthIDTextWatermarkingConfig`] and [`WatermarkingConfig`] for more @@ -366,8 +360,6 @@ def __init__(self, **kwargs): self.do_sample = kwargs.pop("do_sample", False) self.num_beams = kwargs.pop("num_beams", 1) self.num_beam_groups = kwargs.pop("num_beam_groups", 1) - self.penalty_alpha = kwargs.pop("penalty_alpha", None) - self.dola_layers = kwargs.pop("dola_layers", None) # Parameters that control the cache self.use_cache = kwargs.pop("use_cache", True) @@ -403,7 +395,7 @@ def __init__(self, **kwargs): self.sequence_bias = kwargs.pop("sequence_bias", None) self.token_healing = kwargs.pop("token_healing", False) self.guidance_scale = kwargs.pop("guidance_scale", None) - self.low_memory = kwargs.pop("low_memory", None) + watermarking_config = kwargs.pop("watermarking_config", None) if watermarking_config is None: self.watermarking_config = None @@ -445,6 +437,11 @@ def __init__(self, **kwargs): self.compile_config = kwargs.pop("compile_config", None) self.disable_compile = kwargs.pop("disable_compile", False) + # Deprecated (moved to the Hub). TODO joao, manuel: remove in v4.62.0 + self.low_memory = kwargs.pop("low_memory", None) + self.penalty_alpha = kwargs.pop("penalty_alpha", None) + self.dola_layers = kwargs.pop("dola_layers", None) + # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub # interface. self._from_model_config = kwargs.pop("_from_model_config", False) @@ -610,9 +607,7 @@ def validate(self, strict=False): minor_issues["typical_p"] = greedy_wrong_parameter_msg.format( flag_name="typical_p", flag_value=self.typical_p ) - if ( - self.top_k is not None and self.top_k != 50 and self.penalty_alpha is None - ): # contrastive search uses top_k + if self.top_k is not None and self.top_k != 50: minor_issues["top_k"] = greedy_wrong_parameter_msg.format(flag_name="top_k", flag_value=self.top_k) if self.epsilon_cutoff is not None and self.epsilon_cutoff != 0.0: minor_issues["epsilon_cutoff"] = greedy_wrong_parameter_msg.format( diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 3e387c0808d6..e03ad600deb3 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -43,7 +43,6 @@ from ..integrations.deepspeed import is_deepspeed_zero3_enabled from ..integrations.fsdp import is_fsdp_managed_module from ..masking_utils import create_masks_for_generate -from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput from ..pytorch_utils import isin_mps_friendly from ..tokenization_utils import ExtensionsTrie from ..utils import ( @@ -369,7 +368,6 @@ class GenerationMixin(ContinuousMixin): The class exposes [`~generation.GenerationMixin.generate`], which can be used for: - *greedy decoding* if `num_beams=1` and `do_sample=False` - - *contrastive search* if `penalty_alpha>0` and `top_k>1` - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` @@ -1946,15 +1944,17 @@ def _prepare_cache_for_generation( ) generation_config.cache_implementation = None - # assisted decoding and contrastive search need to roll-back the Cache, which is not supported if - # it has sliding layers - so if we use any of those 2, do not pass the config to DynamicCache, which - # will result in creating a Cache with only full layers even if model uses sliding window + # Assisted decoding and contrastive search require cache 
rollback, which is incompatible with sliding layers. + # To handle this, we skip passing the model config to DynamicCache (forcing a full-layer cache). + # The "dynamic_full" option is a shortcut for generate() users to avoid sliding layers on their own. generation_mode = generation_config.get_generation_mode(assistant_model) - dynamic_cache_kwargs = ( - {"config": self.config} - if generation_mode not in (GenerationMode.ASSISTED_GENERATION, GenerationMode.CONTRASTIVE_SEARCH) - else {} - ) + if ( + generation_mode in (GenerationMode.ASSISTED_GENERATION, GenerationMode.CONTRASTIVE_SEARCH) + or generation_config.cache_implementation == "dynamic_full" + ): + dynamic_cache_kwargs = {} + else: + dynamic_cache_kwargs = {"config": self.config} if generation_config.cache_implementation is not None: if generation_config.cache_implementation in ALL_STATIC_CACHE_IMPLEMENTATIONS: if generation_config.cache_implementation in DEPRECATED_STATIC_CACHE_IMPLEMENTATIONS: @@ -1995,7 +1995,7 @@ def _prepare_cache_for_generation( model_kwargs[cache_name] = QuantizedCache(backend=backend, **cache_config) elif generation_config.cache_implementation == "offloaded": model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs, offloading=True) - elif generation_config.cache_implementation == "dynamic": + elif "dynamic" in generation_config.cache_implementation: model_kwargs[cache_name] = DynamicCache(**dynamic_cache_kwargs) # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that @@ -2512,29 +2512,26 @@ def generate( trust_remote_code=trust_remote_code, **kwargs, ) - + # TODO joao, manuel: remove this in v4.62.0 elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: + logger.warning_once( + "Contrastive search was moved to a `custom_generate` repo: https://hf.co/transformers-community/contrastive-search. " + "To prevent loss of backward compatibility, add `custom_generate='transformers-community/contrastive-search'` " + "to your `generate` call before v4.62.0." + ) if not trust_remote_code: logger.warning_once( - "Contrastive Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. " - "To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call." + "Contrastive search requires `trust_remote_code=True` in your `generate` call, since " + "it loads https://hf.co/transformers-community/contrastive-search." ) - if not model_kwargs["use_cache"]: - raise ValueError("Contrastive search requires `use_cache=True`") - if self._is_stateful: - # Just like assisted generation, we need to be able to rollback to a previous state (see comment above) - raise ValueError( - f"contrastive search is not supported with stateful models, such as {self.__class__.__name__}" - ) - - result = self._contrastive_search( - input_ids, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, + # Avoid calling the model-defined `generate` method, since some models (e.g. Janus, Whisper) override it. 
+ return GenerationMixin.generate( + self, + inputs, + custom_generate="transformers-community/contrastive-search", generation_config=generation_config, - synced_gpus=synced_gpus, - streamer=streamer, - **model_kwargs, + trust_remote_code=trust_remote_code, + **kwargs, ) elif generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): @@ -2765,421 +2762,6 @@ def heal_tokens( return input_ids - @torch.no_grad() - def _contrastive_search( - self, - input_ids: torch.LongTensor, - logits_processor: LogitsProcessorList, - stopping_criteria: StoppingCriteriaList, - generation_config: GenerationConfig, - synced_gpus: bool, - streamer: Optional["BaseStreamer"], - **model_kwargs, - ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **contrastive search** and can - be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - generation_config ([`~generation.GenerationConfig`]): - The generation configuration to be used as parametrization of the decoding method. - synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed to avoid deadlocking with - `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] - or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. 
- """ - # init values - has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) - top_k = generation_config.top_k - penalty_alpha = generation_config.penalty_alpha - pad_token_id = generation_config._pad_token_tensor - output_attentions = generation_config.output_attentions - output_hidden_states = generation_config.output_hidden_states - output_scores = generation_config.output_scores - output_logits = generation_config.output_logits - return_dict_in_generate = generation_config.return_dict_in_generate - sequential = generation_config.low_memory - - # init attention / hidden states / scores tuples - raw_logits = () if (return_dict_in_generate and output_logits) else None - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - batch_size, cur_len = input_ids.shape[:2] - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) - model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs) - - # Create cosine_matrix_mask based on the attention_mask - cosine_matrix_mask = torch.ones_like(input_ids, dtype=torch.long) - if self.config.is_encoder_decoder: - if "decoder_attention_mask" in model_kwargs and model_kwargs["decoder_attention_mask"] is not None: - cosine_matrix_mask = model_kwargs["decoder_attention_mask"] - else: - cosine_matrix_mask = model_kwargs["attention_mask"] - cosine_matrix_mask = cosine_matrix_mask.repeat_interleave(top_k, dim=0) - - this_peer_finished = False - - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values; - # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step - if model_kwargs.get("past_key_values") is None or ( - isinstance(model_kwargs["past_key_values"], (Cache, EncoderDecoderCache)) - and model_kwargs["past_key_values"].get_seq_length() == 0 - ): - # prepare inputs - model_kwargs["use_cache"] = True - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # encode the given prefix and prepare model inputs; encoder-decoder model process the prefix and save - # the `encoder_outputs` - outputs = self( - **model_inputs, return_dict=True, output_hidden_states=True, output_attentions=output_attentions - ) - - # last decoder hidden states will be used to compute the degeneration penalty (cosine similarity with - # previous tokens) - if self.config.is_encoder_decoder: - last_hidden_states = outputs.decoder_hidden_states[-1] - else: - last_hidden_states = outputs.hidden_states[-1] - - # next logit for contrastive search to select top-k candidate tokens - # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for this first 
iteration - # (the clone itself is always small) - # torch.float32 is needed to retain precision for later logits manipulations - logit_for_next_step = outputs.logits[:, -1, :].to( - copy=True, dtype=torch.float32, device=input_ids.device - ) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - - if not sequential: - # Expands model inputs top_k times, for batched forward passes (akin to beam search). - # input_ids is required for expanding visual inputs in qwen2vl - _, model_kwargs = self._expand_inputs_for_generation( - input_ids=input_ids, - expand_size=top_k, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - - past_key_values = model_kwargs.get("past_key_values") - if past_key_values is None: - raise ValueError( - f"{self.__class__.__name__} does not support caching and therefore **can't** be used " - "for contrastive search." - ) - elif ( - not isinstance(past_key_values[0], (tuple, torch.Tensor)) - or past_key_values[0][0].shape[0] != batch_size - ): - raise ValueError( - f"{self.__class__.__name__} does not have a standard cache format and therefore **can't** be " - "used for contrastive search without further modifications." - ) - - # contrastive_search main logic start: - # contrastive search decoding consists of two steps: (1) candidate tokens recall; (2) candidate re-rank by - # degeneration penalty - processed_logit_for_next_step = logits_processor(input_ids, logit_for_next_step) - next_probs = nn.functional.softmax(processed_logit_for_next_step, dim=-1) - - top_k_probs, top_k_ids = torch.topk(next_probs, dim=-1, k=top_k) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_logits: - raw_logits += (logit_for_next_step,) - if output_scores: - scores += (processed_logit_for_next_step,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # This is needed to properly delete outputs.logits which may be very large for this first iteration - # Otherwise a reference to outputs.logits is kept all along until after the next call to self.forward() - del outputs - - if not sequential: - # Replicates the new past_key_values to match the `top_k` candidates - past = model_kwargs["past_key_values"] - # If it is a static cache, modify it in-place layer after layer to save memory - if isinstance(past, DynamicCache) or ( - isinstance(past, EncoderDecoderCache) and isinstance(past.self_attention_cache, DynamicCache) - ): - past.batch_repeat_interleave(top_k) - else: - new_key_values = [] - for layer in past: - items = [] - # item is either the key or the value matrix - for item in layer: - items.append(item.repeat_interleave(top_k, dim=0)) - new_key_values.append(tuple(items)) - - past = tuple(new_key_values) - - model_kwargs["past_key_values"] = past - - if sequential: - all_outputs = [] - for i in range(top_k): - # compute the candidate tokens by the language model and collect their hidden_states - next_model_inputs = self.prepare_inputs_for_generation(top_k_ids[:, i].view(-1, 1), **model_kwargs) - - outputs = self( - **next_model_inputs, - return_dict=True, - 
output_hidden_states=True, - output_attentions=output_attentions, - ) - if isinstance(outputs["past_key_values"], DynamicCache) or ( - isinstance(outputs["past_key_values"], EncoderDecoderCache) - and isinstance(outputs["past_key_values"].self_attention_cache, DynamicCache) - ): - # Remove past K-V from output since we don't need to stack later - outputs["past_key_values"] = None - # Remove last token from past K-V since we don't want to append it at this point - model_kwargs["past_key_values"].crop(-1) - - all_outputs.append(outputs) - outputs = stack_model_outputs(all_outputs, self.config.get_text_config()) - - else: - # compute the candidate tokens by the language model and collect their hidden_states - # assembles top_k_ids into batch of size k - next_model_inputs = self.prepare_inputs_for_generation(top_k_ids.view(-1, 1), **model_kwargs) - - outputs = self( - **next_model_inputs, - return_dict=True, - output_hidden_states=True, - output_attentions=output_attentions, - ) - - # This is essential to avoid having a last reference to the big past K-V and double the necessary memory - # in the next loop - del next_model_inputs - - # name is different for encoder-decoder and decoder-only models - if self.config.is_encoder_decoder: - next_hidden = outputs.decoder_hidden_states[-1] - full_hidden_states = outputs.decoder_hidden_states - else: - next_hidden = outputs.hidden_states[-1] - full_hidden_states = outputs.hidden_states - - # .float() is needed to retain precision for later logits manipulations - logits = outputs.logits[:, -1, :].float() - context_hidden = last_hidden_states.repeat_interleave(top_k, dim=0) - - # compute the degeneration penalty and re-rank the candidates based on the degeneration penalty and the - # model confidence. Keeping `selected_idx` on CPU enables multi-device contrastive search and doesn't - # introduce (noticeable) slowdowns on single-device runs. 
- selected_idx = _ranking_fast( - context_hidden, next_hidden, top_k_probs, cosine_matrix_mask, penalty_alpha, top_k - ) - cosine_matrix_mask = torch.cat( - [cosine_matrix_mask, cosine_matrix_mask.new_ones((cosine_matrix_mask.shape[0], 1))], dim=-1 - ) - selected_idx = selected_idx.to("cpu") - - # This will be used instead of the previous inneficient torch.stack(torch.split()) - augmented_idx = torch.tensor([x + i * top_k for i, x in enumerate(selected_idx)]) - - # prepare for the next step: (1) next token_id; (2) past_key_values; (3) last_hidden_states for computing - # the degeneration penalty; (4) logits for selecting next top-k candidates; (5) selected tokens scores - # (model confidence minus degeneration penalty); (6) decoder hidden_states - next_tokens = top_k_ids[range(len(top_k_ids)), selected_idx] - next_hidden = torch.stack(torch.split(next_hidden.squeeze(dim=1), top_k)) - next_hidden = next_hidden[range(batch_size), selected_idx, :] - last_hidden_states = torch.cat([last_hidden_states, next_hidden.unsqueeze(1)], dim=1) - - next_decoder_hidden_states = () - for layer in full_hidden_states: - layer = torch.stack(torch.split(layer, top_k))[range(batch_size), selected_idx, :] - next_decoder_hidden_states += (layer,) - - # generate past_key_values cache of only the selected token - if sequential: - next_model_input = self.prepare_inputs_for_generation( - top_k_ids[:, selected_idx].view(-1, 1), **model_kwargs - ) - - selected_outputs = self( - **next_model_input, - return_dict=True, - output_hidden_states=False, - output_attentions=False, - ) - next_past_key_values = selected_outputs["past_key_values"] - - else: - next_past_key_values = None - for possible_cache_name in ALL_CACHE_NAMES: - next_past_key_values = next_past_key_values or getattr(outputs, possible_cache_name, None) - # Do it in-place layer per layer to save memory - if isinstance(next_past_key_values, DynamicCache) or ( - isinstance(next_past_key_values, EncoderDecoderCache) - and isinstance(next_past_key_values.self_attention_cache, DynamicCache) - ): - next_past_key_values.batch_select_indices(augmented_idx) - else: - new_key_values = [] - for layer in next_past_key_values: - items = [] - # item is either the key or the value matrix - for item in layer: - items.append(item[augmented_idx, ...]) - new_key_values.append(tuple(items)) - - next_past_key_values = tuple(new_key_values) - - logit_for_next_step = torch.stack(torch.split(logits, top_k))[range(batch_size), selected_idx, :] - logit_for_next_step = logit_for_next_step.to(input_ids.device) - - # Rebuilds the relevant parts of the model output for the selected token, for use in the next iteration - if self.config.is_encoder_decoder: - next_step_cross_attentions = () - next_step_decoder_attentions = () - if output_attentions: - for layer in outputs.cross_attentions: - layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...] - next_step_cross_attentions += (layer,) - for layer in outputs.decoder_attentions: - layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...] 
- next_step_decoder_attentions += (layer,) - outputs = Seq2SeqLMOutput( - past_key_values=next_past_key_values, - decoder_hidden_states=next_decoder_hidden_states, - decoder_attentions=next_step_decoder_attentions or None, - cross_attentions=next_step_cross_attentions or None, - ) - else: - next_step_attentions = () - if output_attentions: - for layer in outputs.attentions: - layer = torch.stack(torch.split(layer, top_k, dim=0))[range(batch_size), selected_idx, ...] - next_step_attentions += (layer,) - outputs = CausalLMOutputWithPast( - past_key_values=next_past_key_values, - hidden_states=next_decoder_hidden_states, - attentions=next_step_attentions or None, - ) - # contrastive_search main logic end - - # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if synced_gpus and this_peer_finished: - continue - - # finished sentences should have their next token be a padding token - if has_eos_stopping_criteria: - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - - # stop when each sentence is finished - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) - this_peer_finished = unfinished_sequences.max() == 0 - - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - # Contrastive search works by forward looking at the next token, so we need to exclude it from - # `past_key_values` to be consistent with the other decoding methods - if model_kwargs.get("past_key_values") is not None: - if isinstance(model_kwargs["past_key_values"], DynamicCache) or ( - isinstance(model_kwargs["past_key_values"], EncoderDecoderCache) - and isinstance(model_kwargs["past_key_values"].self_attention_cache, DynamicCache) - ): - model_kwargs["past_key_values"].crop(-1) - else: - past_key_values = [] - for layer in model_kwargs["past_key_values"]: - layer_past_key_values = [] - for item in layer: - layer_past_key_values.append(item[..., :-1, :]) - past_key_values.append(tuple(layer_past_key_values)) - model_kwargs["past_key_values"] = tuple(past_key_values) - - if self.config.is_encoder_decoder: - return GenerateEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - logits=raw_logits, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return input_ids - def _sample( self, input_ids: torch.LongTensor, @@ -4873,37 +4455,6 @@ def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_at return outputs -def _ranking_fast( - context_hidden: torch.FloatTensor, - next_hidden: torch.FloatTensor, - next_top_k_probs: torch.FloatTensor, - cosine_matrix_mask: torch.LongTensor, - alpha: float, - beam_width: int, -) -> torch.FloatTensor: - """ - Reranks the 
top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described - in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each - row in the batch. - """ - norm_context_hidden = context_hidden / context_hidden.norm(dim=2, keepdim=True) - norm_next_hidden = next_hidden / next_hidden.norm(dim=2, keepdim=True) - cosine_matrix = torch.matmul(norm_context_hidden, norm_next_hidden.transpose(1, 2)).squeeze(-1) # [B*K, S] - - # Penalize cosine_matrix based on the cosine_matrix_mask (ignore padding positions) - # Using a large negative value for masked positions - cosine_matrix_mask = cosine_matrix_mask.to(dtype=cosine_matrix.dtype) - cosine_matrix_mask = (1 - cosine_matrix_mask) * torch.finfo(cosine_matrix.dtype).min - cosine_matrix = cosine_matrix + cosine_matrix_mask - - degeneration_penalty, _ = torch.max(cosine_matrix, dim=-1) # [B*K] - next_top_k_probs = next_top_k_probs.view(-1) # [B*K] - contrastive_score = (1.0 - alpha) * next_top_k_probs - alpha * degeneration_penalty - contrastive_score = torch.stack(torch.split(contrastive_score, beam_width)) # [B, K] - _, selected_idx = contrastive_score.max(dim=-1) # [B] - return selected_idx - - def stack_model_outputs(model_outputs: list[ModelOutput], config: PretrainedConfig) -> ModelOutput: """ Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py index 765019f2e5f2..429d61cbd26a 100644 --- a/tests/generation/test_configuration_utils.py +++ b/tests/generation/test_configuration_utils.py @@ -289,6 +289,7 @@ def test_generation_mode(self): config = GenerationConfig(num_beams=2) self.assertEqual(config.get_generation_mode(), GenerationMode.BEAM_SEARCH) + # TODO joao, manuel: remove this in v4.62.0 config = GenerationConfig(top_k=10, do_sample=False, penalty_alpha=0.6) self.assertEqual(config.get_generation_mode(), GenerationMode.CONTRASTIVE_SEARCH) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 92d770ff10d2..449d8122c12b 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -62,7 +62,6 @@ if is_torch_available(): import torch - import torch.nn.functional as F from transformers import ( AutoModelForCausalLM, @@ -76,7 +75,6 @@ GPT2Tokenizer, ImageGPTForCausalImageModeling, SpeechEncoderDecoderModel, - T5ForConditionalGeneration, ) from transformers.cache_utils import ( Cache, @@ -415,41 +413,6 @@ def _constrained_beam_search_generate( return output_generate - def _contrastive_generate( - self, - model, - inputs_dict, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - contrastive_search_kwargs = { - "penalty_alpha": 0.6, - "top_k": 5, - } - - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - output_generate = model.generate( - do_sample=False, - num_beams=1, - max_new_tokens=self.max_new_tokens, - min_new_tokens=self.max_new_tokens, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - output_scores=output_scores, - output_logits=output_logits, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **logits_processor_kwargs, - **contrastive_search_kwargs, - **inputs_dict, - ) - - return output_generate - @pytest.mark.generate 
def test_greedy_generate(self): for model_class in self.all_generative_model_classes: @@ -964,108 +927,6 @@ def test_constrained_beam_search_generate_dict_output(self): num_beams=beam_kwargs["num_beams"], ) - @pytest.mark.generate - def test_contrastive_generate(self): - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config.get_text_config(), "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - config.is_decoder = True - - # test old generation output for backwards compatibility - model = model_class(config).to(torch_device).eval() - output_generate = self._contrastive_generate( - model=model, - inputs_dict=inputs_dict, - use_cache=True, # Enable cache - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - @pytest.mark.generate - def test_contrastive_generate_dict_outputs_use_cache(self): - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - # won't fix: FSMT and Reformer have a different cache variable type (and format). - if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # NOTE: contrastive search only works with cache on at the moment. 
- if not hasattr(config.get_text_config(), "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - config.is_decoder = True - if self.has_attentions: - config._attn_implementation = "eager" # can't output attentions otherwise - - model = model_class(config).to(torch_device).eval() - output_generate = self._contrastive_generate( - model=model, - inputs_dict=inputs_dict, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=True, # Enable cache - ) - - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue( - output_generate.sequences.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1] - ) - - self._check_generate_outputs(output_generate, model.config, use_cache=True) - - @pytest.mark.generate - def test_contrastive_generate_low_memory(self): - # Check that choosing 'low_memory' does not change the model output - for model_class in self.all_generative_model_classes: - if model_class._is_stateful: - self.skipTest(reason="Stateful models don't support contrastive search generation") - - if any(model_name in model_class.__name__.lower() for model_name in ["reformer"]): - self.skipTest(reason="Won't fix: old model with different cache format") - - config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - - # NOTE: contrastive search only works with cache on at the moment. - if not hasattr(config.get_text_config(), "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - config.is_decoder = True - - # test output equality of low versus high memory - model = model_class(config).to(torch_device).eval() - generate_kwargs = { - "top_k": 4, - "penalty_alpha": 0.6, - "max_new_tokens": self.max_new_tokens, - "use_cache": True, - "return_dict_in_generate": True, - "output_scores": True, - } - - low_output = model.generate(**inputs_dict, **generate_kwargs, low_memory=True) - high_output = model.generate(**inputs_dict, **generate_kwargs, low_memory=False) - self.assertTrue(has_similar_generate_outputs(low_output, high_output)) - @parameterized.expand([("random",), ("same",)]) @pytest.mark.generate def test_assisted_decoding_matches_greedy_search(self, assistant_type): @@ -3443,31 +3304,6 @@ def test_decoder_start_id_from_config(self): with self.assertRaises(ValueError): outputs = bart_model.generate(input_ids, generation_config=GenerationConfig(do_sample=False)) - def test_contrastive_search_batched(self): - # Tests that contrastive search works with batched inputs (i.e. 
has the same output as for non-batched inputs) - articles = ["Foo", "Bar Baz"] - tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") - model = BartForConditionalGeneration.from_pretrained("hf-internal-testing/tiny-random-bart").to(torch_device) - - model.config.eos_token_id = None - input_ids_batched = tokenizer(articles, padding=True, return_tensors="pt").input_ids.to(torch_device) - input_ids = tokenizer(articles[1], return_tensors="pt").input_ids.to(torch_device) - - output_sequences_batched = model.generate( - input_ids=input_ids_batched, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - output_sequences = model.generate( - input_ids=input_ids, penalty_alpha=0.6, top_k=4, return_dict_in_generate=True, output_scores=True - ) - - batched_out = tokenizer.decode(output_sequences_batched.sequences[1], skip_special_tokens=True) - out = tokenizer.decode(output_sequences.sequences[0], skip_special_tokens=True) - self.assertEqual(batched_out, out) - - # output_sequences_batched.scores[0][1] -> 1st set of logits, 2nd sequence - max_score_diff = (output_sequences_batched.scores[0][1] - output_sequences.scores[0][0]).abs().max() - self.assertTrue(max_score_diff < 1e-5) - def test_logits_processor_not_inplace(self): article = "Today a dragon flew over Paris." model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) @@ -4052,139 +3888,6 @@ def test_init_static_cache_multi_accelerator(self): values_1 = results.past_key_values.layers[1].values self.assertTrue(keys_1.device == values_1.device == torch.device(1)) - @slow - def test_padding_input_contrastive_search_gpt2(self): - # Load the pre-trained GPT-2 model and tokenizer - model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2") - model.to(torch_device) - tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", clean_up_tokenization_spaces=True) - - # Set the tokenizer to left-pad the sequences - tokenizer.padding_side = "left" - - # Define the PAD token as the EOS token - tokenizer.pad_token = tokenizer.eos_token - model.generation_config.pad_token_id = model.generation_config.eos_token_id - - # Define the input prompt - prompt_text = "The whispered legends of the haunted mansion spoke" - - # Tokenize the input prompt - encoded_prompt = tokenizer(prompt_text, return_tensors="pt", padding=True) - input_ids = encoded_prompt.input_ids.to(torch_device) - attention_mask = encoded_prompt.attention_mask.to(torch_device) - - # Define the contrastive search params - penalty_alpha = 0.6 - top_k = 4 - - # Define the padding length to add to the input IDs and attention mask - padding_length = 10 - - # Generate text without padding - outputs = model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - do_sample=False, - penalty_alpha=penalty_alpha, - top_k=top_k, - max_new_tokens=64, - ) - generated_text_no_padding = tokenizer.decode(outputs[0], skip_special_tokens=True) - - # Pad the input IDs and attention mask on the left - padded_input_ids = F.pad( - input_ids, (padding_length, 0), "constant", value=model.generation_config.pad_token_id - ) - padded_attention_mask = F.pad(attention_mask, (padding_length, 0), "constant", value=0) - - # Generate text with padded inputs - outputs_with_padding = model.generate( - input_ids=padded_input_ids, - attention_mask=padded_attention_mask, - do_sample=False, - penalty_alpha=penalty_alpha, - top_k=top_k, - max_new_tokens=64, - ) - generated_text_with_padding = 
tokenizer.decode(outputs_with_padding[0], skip_special_tokens=True) - - # Assert that the generated texts are identical for padded and non-padded inputs - self.assertEqual(generated_text_no_padding, generated_text_with_padding) - self.assertEqual( - generated_text_with_padding, - 'The whispered legends of the haunted mansion spoke of the "souls of the dead" who were "falling ' - 'out of the sky" and "falling into the sea."\n\nThe ghostly apparitions were said to have been ' - 'created by the spirits of the dead, who were "falling out of the sky" and "falling into the sea', - ) - - @slow - def test_padding_input_contrastive_search_t5(self): - # Load the pre-trained T5 model and tokenizer - model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") - model.to(torch_device) - tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small", clean_up_tokenization_spaces=True) - - # Define the input prompt - prompt_text = "translate English to German: I need to finish this task before the end of the day." - - # Tokenize the input prompt - encoded_prompt = tokenizer(prompt_text, return_tensors="pt") - input_ids = encoded_prompt.input_ids.to(torch_device) - attention_mask = encoded_prompt.attention_mask.to(torch_device) - - # Define the decoder prompt - decoder_prompt_text = "Ich muss diese Aufgabe" - encoded_decoder_prompt = tokenizer(decoder_prompt_text, add_special_tokens=False, return_tensors="pt") - decoder_input_ids = encoded_decoder_prompt.input_ids.to(torch_device) - decoder_attention_mask = encoded_decoder_prompt.attention_mask.to(torch_device) - - # Define the contrastive search params - penalty_alpha = 0.6 - top_k = 4 - - # Generate text without padding - outputs = model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - do_sample=False, - penalty_alpha=penalty_alpha, - top_k=top_k, - max_new_tokens=64, - ) - generated_text_no_padding = tokenizer.decode(outputs[0], skip_special_tokens=True) - - # Define the padding length to add to the input IDs and attention mask - padding_length = 10 - - # Pad the decoder input IDs and attention mask on the left - padded_decoder_input_ids = F.pad( - decoder_input_ids, (padding_length, 0), "constant", value=model.generation_config.pad_token_id - ) - padded_decoder_attention_mask = F.pad(decoder_attention_mask, (padding_length, 0), "constant", value=0) - # Since the decoder_start_token_id is the same as the pad_token_id, - # the last padded token represents the decoder start token. - # Set the attention mask for the decoder_start_token_id to True (1). 
- padded_decoder_attention_mask[:, padding_length - 1] = 1 - # Generate text with padded inputs - outputs_with_padding = model.generate( - input_ids=input_ids, - attention_mask=attention_mask, - decoder_input_ids=padded_decoder_input_ids, - decoder_attention_mask=padded_decoder_attention_mask, - do_sample=False, - penalty_alpha=penalty_alpha, - top_k=top_k, - max_new_tokens=64, - ) - generated_text_with_padding = tokenizer.decode(outputs_with_padding[0], skip_special_tokens=True) - - # Assert that the generated texts are identical for padded and non-padded inputs - self.assertEqual(generated_text_no_padding, generated_text_with_padding) - self.assertEqual(generated_text_no_padding, "Ich muss diese Aufgabe vor Ende des Tages beenden.") - def test_prepare_inputs_for_generation_decoder_llm(self): """Tests GenerationMixin.prepare_inputs_for_generation against expected usage with decoder-only llms.""" @@ -5113,7 +4816,13 @@ def test_generate_custom_cache_position(self): ) @pytest.mark.generate - def test_dola_hub_runs(self): + @parameterized.expand( + [ + ("transformers-community/dola", {"dola_layers": "low"}), + ("transformers-community/contrastive-search", {"penalty_alpha": 0.6, "top_k": 4}), + ] + ) + def test_hub_gen_strategies(self, custom_generate, extra_kwargs): model = AutoModelForCausalLM.from_pretrained( "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=torch_device, @@ -5123,7 +4832,7 @@ def test_dola_hub_runs(self): "input_ids": torch.tensor([[1, 22557, 28725, 1526, 28808]], device=torch_device), "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], device=torch_device), } - # Sets dola generation arguments such that: + # Sets generation arguments such that: # a) no EOS is generated, to ensure generation doesn't break early # b) there are at least two forward passes in the main model, to ensure the input preparation of # the main model is correct @@ -5138,13 +4847,13 @@ def test_dola_hub_runs(self): "output_attentions": True, "return_dict_in_generate": True, "use_cache": True, - "dola_layers": "low", "trust_remote_code": True, - "custom_generate": "transformers-community/dola", + "custom_generate": custom_generate, } + generation_kwargs.update(extra_kwargs) torch.manual_seed(0) - output_dola = model.generate(**generation_kwargs, **model_inputs) - self.assertEqual(output_dola.sequences.shape, (1, 9)) + output = model.generate(**generation_kwargs, **model_inputs) + self.assertEqual(output.sequences.shape, (1, 9)) @require_torch diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index ded8d5f0a8e8..9d887895b941 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1205,6 +1205,7 @@ def test_cnn_summarization_same_as_fairseq(self): generated_summaries = tok.batch_decode(hypotheses_batch.tolist()) assert generated_summaries == EXPECTED + # TODO joao, manuel: remove this in v4.62.0 @slow def test_contrastive_search_bart(self): article = ( @@ -1238,7 +1239,15 @@ def test_contrastive_search_bart(self): article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt" ).input_ids.to(torch_device) - outputs = bart_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64, num_beams=1) + outputs = bart_model.generate( + input_ids, + penalty_alpha=0.5, + top_k=5, + max_length=64, + num_beams=1, + trust_remote_code=True, + custom_generate="transformers-community/contrastive-search", + ) generated_text = bart_tokenizer.batch_decode(outputs, 
skip_special_tokens=True) self.assertListEqual( diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index fa29ec4df6e7..f81685abd091 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -292,21 +292,6 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): pass - @pytest.mark.generate - @unittest.skip(reason="CSM does not support contrastive search.") - def test_contrastive_generate(self): - pass - - @pytest.mark.generate - @unittest.skip(reason="CSM does not support contrastive search.") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @pytest.mark.generate - @unittest.skip(reason="CSM does not support contrastive search.") - def test_contrastive_generate_low_memory(self): - pass - @pytest.mark.generate @unittest.skip(reason="CSM does not support prompt lookup decoding.") def test_prompt_lookup_decoding_matches_greedy_search(self): diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 284cd4c19909..7c6a322d99cb 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -516,6 +516,7 @@ def test_model_2b_bf16_dola(self): dola_layers="low", repetition_penalty=1.2, trust_remote_code=True, + custom_generate="transformers-community/dola", ) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) self.assertEqual(output_text, EXPECTED_TEXTS) diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 1aac2069b084..072bbd081643 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -837,6 +837,7 @@ def test_gpt2_sample(self): all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) ) # token_type_ids should change output + # TODO joao, manuel: remove this in v4.62.0 @slow def test_contrastive_search_gpt2(self): article = ( @@ -848,7 +849,14 @@ def test_contrastive_search_gpt2(self): gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large").to(torch_device) input_ids = gpt2_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - outputs = gpt2_model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + outputs = gpt2_model.generate( + input_ids, + penalty_alpha=0.6, + top_k=4, + max_length=256, + trust_remote_code=True, + custom_generate="transformers-community/contrastive-search", + ) generated_text = gpt2_tokenizer.batch_decode(outputs, skip_special_tokens=True) diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index a57dc883f3ca..b24f47c32bca 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -424,14 +424,6 @@ def test_config(self): def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search not supported due to non-standard caching mechanism") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - @unittest.skip(reason="CPU offload seems to be broken for some reason - tiny models keep hitting corner cases") def test_cpu_offload(self): pass diff --git a/tests/models/gptj/test_modeling_gptj.py 
b/tests/models/gptj/test_modeling_gptj.py index d4cf398a6da9..073660b49cf0 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -541,6 +541,7 @@ def test_gptj_sample(self): all(output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))) ) # token_type_ids should change output + # TODO joao, manuel: remove this in v4.62.0 @tooslow def test_contrastive_search_gptj(self): article = ( @@ -554,7 +555,14 @@ def test_contrastive_search_gptj(self): ) input_ids = tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - outputs = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_length=256) + outputs = model.generate( + input_ids, + penalty_alpha=0.6, + top_k=4, + max_length=256, + trust_remote_code=True, + custom_generate="transformers-community/contrastive-search", + ) generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) self.assertListEqual( diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 15ab5ef22b47..454b38975cdd 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -844,18 +844,6 @@ def _check_attentions_for_generate( """ pass - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_low_memory(self): - pass - @unittest.skip(reason="We only test the model that takes in multiple images") def test_custom_4d_attention_mask(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 1a31eb852ec1..199664a73d85 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -390,18 +390,6 @@ def test_flash_attn_2_generate_padding_right(self): def test_flash_attn_2_inference_padding_right(self): pass - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_low_memory(self): - pass - @unittest.skip( reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" ) diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 234f6ceb8b01..97cff53643bc 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -351,18 +351,6 @@ def test_inputs_embeds(): def test_flash_attn_2_inference_padding_right(self): pass - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="Contrastive 
search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_low_memory(self): - pass - @unittest.skip( reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" ) diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index d9c2476a1fce..62ee1be2dbe6 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -585,14 +585,6 @@ def test_flash_attn_2_generate_reuse_cache(self): def test_generate_from_inputs_embeds(self): pass - # TODO: ydshieh - @pytest.mark.generate - @unittest.skip( - "Kosmos2_5ForConditionalGeneration returns `vision_model_output` which is currently not working with `stack_model_outputs`", - ) - def test_beam_search_low_memory(self): - pass - @pytest.mark.generate def test_left_padding_compatibility(self): # Overwrite because Kosmos-2.5 need to padd pixel values and pad image-attn-mask diff --git a/tests/models/lfm2/test_modeling_lfm2.py b/tests/models/lfm2/test_modeling_lfm2.py index 4603f54dc7f7..52d4b4d6fce1 100644 --- a/tests/models/lfm2/test_modeling_lfm2.py +++ b/tests/models/lfm2/test_modeling_lfm2.py @@ -75,18 +75,6 @@ def test_attention_outputs(self): def test_past_key_values_format(self): pass - @unittest.skip("Lfm2 has a special cache format which is not compatible with contrastive search") - def test_contrastive_generate(self): - pass - - @unittest.skip("Lfm2 has a special cache format which is not compatible with contrastive search") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip("Lfm2 has a special cache format which is not compatible with contrastive search") - def test_contrastive_generate_low_memory(self): - pass - @unittest.skip( "Lfm2 has a special cache format which is not compatible with compile as it has static address for conv cache" ) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 03458d53a37b..9217510fb0b0 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -249,7 +249,14 @@ def test_model_7b_dola_generation(self): # greedy generation outputs generated_ids = model.generate( - **model_inputs, max_new_tokens=64, top_p=None, temperature=1, do_sample=False, dola_layers="low" + **model_inputs, + max_new_tokens=64, + top_p=None, + temperature=1, + do_sample=False, + dola_layers="low", + trust_remote_code=True, + custom_generate="transformers-community/dola", ) text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, text) diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 200926caa294..8380b49c3735 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -192,6 +192,7 @@ def test_model_7b_dola_generation(self): dola_layers="low", repetition_penalty=1.2, trust_remote_code=True, + custom_generate="transformers-community/dola", ) text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) self.assertEqual(EXPECTED_TEXT_COMPLETION, text) diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 8ff45ff9678b..331d1aba498b 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -543,6 +543,7 @@ def test_batched_nan_fp16(self): 
torch.isnan(outputs.logits[0]).any().item() ) # the first logits could contain NaNs if it fails + # TODO joao, manuel: remove this in v4.62.0 @slow def test_contrastive_search_opt(self): article = ( @@ -555,7 +556,14 @@ def test_contrastive_search_opt(self): opt_model = OPTForCausalLM.from_pretrained("facebook/opt-1.3b").to(torch_device) input_ids = opt_tokenizer(article, return_tensors="pt").input_ids.to(torch_device) - outputs = opt_model.generate(input_ids, penalty_alpha=0.6, top_k=5, max_length=256) + outputs = opt_model.generate( + input_ids, + penalty_alpha=0.6, + top_k=5, + max_length=256, + trust_remote_code=True, + custom_generate="transformers-community/contrastive-search", + ) generated_text = opt_tokenizer.batch_decode(outputs, skip_special_tokens=True) self.assertListEqual( diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index f4c211a5a6c5..ad345e70e03e 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -273,10 +273,6 @@ def test_feed_forward_chunking(self): def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass - @unittest.skip("Low memory will be removed soon so no need to fix it") - def test_beam_search_low_memory(self): - pass - @parameterized.expand([("random",), ("same",)]) @pytest.mark.generate @unittest.skip("Paligemma2 does not seem to be compatible with assisted decoding") diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py index c485fb92d8d3..45aec1da4ba9 100644 --- a/tests/models/smolvlm/test_modeling_smolvlm.py +++ b/tests/models/smolvlm/test_modeling_smolvlm.py @@ -345,18 +345,6 @@ def setUp(self): def test_flash_attn_2_inference_padding_right(self): pass - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="Contrastive search is not implemented for VLMs that do cross-attn") - def test_contrastive_generate_low_memory(self): - pass - @unittest.skip( reason="Prompt lookup decoding needs a way to indicate `bad_word_ids` that should not be suggested as candidates" ) diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 1a0780089779..9de1467fa061 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -1569,6 +1569,7 @@ def test_translation_en_to_ro(self): translation = tok.decode(output[0]) self.assertEqual(translation, expected_translation) + # TODO joao, manuel: remove this in v4.62.0 @slow def test_contrastive_search_t5(self): article = ( @@ -1603,7 +1604,14 @@ def test_contrastive_search_t5(self): article, add_special_tokens=False, truncation=True, max_length=512, return_tensors="pt" ).input_ids.to(torch_device) - outputs = t5_model.generate(input_ids, penalty_alpha=0.5, top_k=5, max_length=64) + outputs = t5_model.generate( + input_ids, + penalty_alpha=0.5, + top_k=5, + max_length=64, + trust_remote_code=True, + custom_generate="transformers-community/contrastive-search", + ) generated_text = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True) # TODO: @arthur? 
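For reference, a minimal sketch of the invocation pattern the updated tests above exercise: after this change, contrastive search is fetched from the Hub via `custom_generate` rather than being a built-in decoding loop. The checkpoint, prompt, and generation length below are illustrative assumptions and not mandated by this patch; the `penalty_alpha`/`top_k` values mirror the ones used in the tests.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Any causal LM works here; GPT-2 is only an example checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("DeepMind Company is", return_tensors="pt")
outputs = model.generate(
    **inputs,
    penalty_alpha=0.6,  # contrastive search degeneration penalty
    top_k=4,            # number of candidate tokens considered per step
    max_new_tokens=64,
    trust_remote_code=True,  # required: the decoding loop is downloaded from the Hub repo below
    custom_generate="transformers-community/contrastive-search",
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))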
From c89c892e7b24a7d71831f2b35264456005030925 Mon Sep 17 00:00:00 2001 From: Manuel de Prada Corral Date: Wed, 27 Aug 2025 11:45:20 +0200 Subject: [PATCH 2/5] testing that hub works the same --- docs/source/en/generation_strategies.md | 22 -- docs/source/en/internal/generation_utils.md | 7 - docs/source/en/kv_cache.md | 2 +- docs/source/ja/generation_strategies.md | 37 -- docs/source/ja/internal/generation_utils.md | 7 - docs/source/ko/generation_strategies.md | 38 -- docs/source/ko/internal/generation_utils.md | 7 - docs/source/zh/internal/generation_utils.md | 7 - src/transformers/__init__.py | 4 - src/transformers/configuration_utils.py | 2 - src/transformers/generation/__init__.py | 5 +- src/transformers/generation/beam_search.py | 337 +---------------- .../generation/configuration_utils.py | 40 +- src/transformers/generation/logits_process.py | 136 ------- src/transformers/generation/utils.py | 343 +----------------- src/transformers/models/dia/generation_dia.py | 2 +- .../models/janus/modeling_janus.py | 2 +- .../models/janus/modular_janus.py | 2 +- .../models/musicgen/modeling_musicgen.py | 4 +- .../modeling_musicgen_melody.py | 4 +- src/transformers/utils/dummy_pt_objects.py | 14 - tests/generation/test_beam_search.py | 236 ------------ tests/generation/test_configuration_utils.py | 26 -- tests/generation/test_logits_process.py | 31 -- tests/generation/test_utils.py | 5 +- tests/utils/test_cache_utils.py | 2 - tests/utils/test_configuration_utils.py | 2 - 27 files changed, 39 insertions(+), 1285 deletions(-) diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 5c7d27192292..63b70899af4d 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -225,28 +225,6 @@ outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=to tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -### Diverse beam search - -[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups. - -Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`). - -```py -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device - -device = infer_device() - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") -inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device) - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device) -# explicitly set to 100 because Llama2 generation length is 4096 -outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False) -tokenizer.batch_decode(outputs, skip_special_tokens=True) -'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. 
We believe that AI should be used for the benefit of humanity, not for the benefit of a' -``` - ## Custom generation methods diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index ecd4e77fc5f7..9deb926b905f 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -108,9 +108,6 @@ generation. [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -219,10 +216,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index f58d4a995e80..f7bcd3252493 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -146,7 +146,7 @@ tokenizer = AutoTokenizer.from_pretrained(ckpt) model = AutoModelForCausalLM.from_pretrained(ckpt, dtype=torch.float16, device_map="auto") prompt = ["okay "*1000 + "Fun fact: The most"] inputs = tokenizer(prompt, return_tensors="pt").to(model.device) -beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, } +beams = { "num_beams": 40, "num_return_sequences": 20, "max_new_tokens": 23, "early_stopping": True, } out = resilient_generate(model, **inputs, **beams) responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True) ``` diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md index 856c4856c52f..45eec30c0765 100644 --- a/docs/source/ja/generation_strategies.md +++ b/docs/source/ja/generation_strategies.md @@ -241,43 +241,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### Diverse beam search decoding - -多様なビームサーチデコーディング戦略は、ビームサーチ戦略の拡張であり、選択肢からより多様なビームシーケンスを生成できるようにします。この仕組みの詳細については、[Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) をご参照ください。このアプローチには、`num_beams`、`num_beam_groups`、および `diversity_penalty` という3つの主要なパラメータがあります。多様性ペナルティは、出力がグループごとに異なることを保証し、ビームサーチは各グループ内で使用されます。 - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... "that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -... 
"the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - ### Assisted Decoding アシストデコーディングは、上記のデコーディング戦略を変更したもので、同じトークナイザー(理想的にははるかに小さなモデル)を使用して、いくつかの候補トークンを貪欲に生成するアシスタントモデルを使用します。その後、主要なモデルは候補トークンを1つの前向きパスで検証し、デコーディングプロセスを高速化します。現在、アシストデコーディングでは貪欲検索とサンプリングのみがサポートされており、バッチ入力はサポートされていません。アシストデコーディングの詳細については、[このブログ記事](https://huggingface.co/blog/assisted-generation) をご覧ください。 diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md index 1a5cc1dec079..c01d86f54bc0 100644 --- a/docs/source/ja/internal/generation_utils.md +++ b/docs/source/ja/internal/generation_utils.md @@ -139,9 +139,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -321,10 +318,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/ko/generation_strategies.md b/docs/source/ko/generation_strategies.md index da38e4f418f2..c59eff4111f3 100644 --- a/docs/source/ko/generation_strategies.md +++ b/docs/source/ko/generation_strategies.md @@ -232,44 +232,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### 다양한 빔 탐색 디코딩(Diverse beam search decoding)[[diverse-beam-search-decoding]] - -다양한 빔 탐색(Decoding) 전략은 선택할 수 있는 더 다양한 빔 시퀀스 집합을 생성할 수 있게 해주는 빔 탐색 전략의 확장입니다. 이 방법은 어떻게 작동하는지 알아보려면, [다양한 빔 탐색: 신경 시퀀스 모델에서 다양한 솔루션 디코딩하기](https://huggingface.co/papers/1610.02424)를 참조하세요. 이 접근 방식은 세 가지 주요 매개변수를 가지고 있습니다: `num_beams`, `num_beam_groups`, 그리고 `diversity_penalty`. 다양성 패널티는 그룹 간에 출력이 서로 다르게 하기 위한 것이며, 각 그룹 내에서 빔 탐색이 사용됩니다. - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... "that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. 
Each design principle serves as a tool that allows us to integrate all " -... "the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - -이 가이드에서는 다양한 디코딩 전략을 가능하게 하는 주요 매개변수를 보여줍니다. [`generate`] 메서드에 대한 고급 매개변수가 존재하므로 [`generate`] 메서드의 동작을 더욱 세부적으로 제어할 수 있습니다. 사용 가능한 매개변수의 전체 목록은 [API 문서](./main_classes/text_generation)를 참조하세요. - ### 추론 디코딩(Speculative Decoding)[[speculative-decoding]] 추론 디코딩(보조 디코딩(assisted decoding)으로도 알려짐)은 동일한 토크나이저를 사용하는 훨씬 작은 보조 모델을 활용하여 몇 가지 후보 토큰을 생성하는 상위 모델의 디코딩 전략을 수정한 것입니다. 주 모델은 단일 전방 통과로 후보 토큰을 검증함으로써 디코딩 과정을 가속화합니다. `do_sample=True`일 경우, [추론 디코딩 논문](https://huggingface.co/papers/2211.17192)에 소개된 토큰 검증과 재샘플링 방식이 사용됩니다. diff --git a/docs/source/ko/internal/generation_utils.md b/docs/source/ko/internal/generation_utils.md index bf567920610c..9bd669e34d2b 100644 --- a/docs/source/ko/internal/generation_utils.md +++ b/docs/source/ko/internal/generation_utils.md @@ -131,9 +131,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -326,10 +323,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md index 084e2a29dc8c..b33ac4be9c92 100644 --- a/docs/source/zh/internal/generation_utils.md +++ b/docs/source/zh/internal/generation_utils.md @@ -133,9 +133,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -316,10 +313,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2c0e6f45bf1b..0d947ef7adcf 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -410,7 +410,6 @@ "BayesianDetectorConfig", "BayesianDetectorModel", "BeamScorer", - "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", "ConstrainedBeamSearchScorer", "Constraint", @@ -425,7 +424,6 @@ "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", "GenerationMixin", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -655,7 +653,6 @@ from .generation import BayesianDetectorConfig as BayesianDetectorConfig from .generation import BayesianDetectorModel as BayesianDetectorModel from .generation import BeamScorer as BeamScorer - from .generation import BeamSearchScorer as BeamSearchScorer from .generation import ClassifierFreeGuidanceLogitsProcessor as 
ClassifierFreeGuidanceLogitsProcessor from .generation import CompileConfig as CompileConfig from .generation import ConstrainedBeamSearchScorer as ConstrainedBeamSearchScorer @@ -686,7 +683,6 @@ from .generation import ForcedEOSTokenLogitsProcessor as ForcedEOSTokenLogitsProcessor from .generation import GenerationConfig as GenerationConfig from .generation import GenerationMixin as GenerationMixin - from .generation import HammingDiversityLogitsProcessor as HammingDiversityLogitsProcessor from .generation import InfNanRemoveLogitsProcessor as InfNanRemoveLogitsProcessor from .generation import LogitNormalization as LogitNormalization from .generation import LogitsProcessor as LogitsProcessor diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index a290fcfc733b..035e8ce791a6 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1121,8 +1121,6 @@ def _get_global_generation_defaults() -> dict[str, Any]: "do_sample": False, "early_stopping": False, "num_beams": 1, - "num_beam_groups": 1, - "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 64ebfe6fc7c3..4fb3d32213f8 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -44,7 +44,6 @@ _import_structure["beam_search"] = [ "BeamHypotheses", "BeamScorer", - "BeamSearchScorer", "ConstrainedBeamSearchScorer", ] _import_structure["candidate_generator"] = [ @@ -63,7 +62,6 @@ "ExponentialDecayLengthPenalty", "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -209,7 +207,7 @@ pass else: from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint - from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer + from .beam_search import BeamHypotheses, BeamScorer, ConstrainedBeamSearchScorer from .candidate_generator import ( AssistedCandidateGenerator, CandidateGenerator, @@ -227,7 +225,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessor, diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index b6647760b790..08af5755e3d7 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -45,8 +45,6 @@ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. beam_indices (`torch.LongTensor`, *optional*): Beam indices indicating to which beam hypothesis each token correspond. - group_index (`int`, *optional*): - The index of the group of beams. Used with [`~PreTrainedModel.group_beam_search`]. Return: `UserDict`: A dictionary composed of the fields as defined above: @@ -120,302 +118,6 @@ def finalize( raise NotImplementedError("This is an abstract method.") -class BeamSearchScorer(BeamScorer): - r""" - [`BeamScorer`] implementing standard beam search decoding. - - Adapted in part from [Facebook's XLM beam search - code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). 
- - Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS - implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua) - - Args: - batch_size (`int`): - Batch Size of `input_ids` for which standard beam search decoding is run in parallel. - num_beams (`int`): - Number of beams for beam search. - device (`torch.device`): - Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be - allocated. - length_penalty (`float`, *optional*, defaults to 1.0): - Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log - likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while - `length_penalty` < 0.0 encourages shorter sequences. - do_early_stopping (`bool` or `str`, *optional*, defaults to `False`): - Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: - `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an - heuristic is applied and the generation stops when is it very unlikely to find better candidates; - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical - beam search algorithm). - num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): - The number of beam hypotheses that shall be returned upon calling - [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. - max_length (`int`, *optional*): - The maximum length of the sequence to be generated. - """ - - def __init__( - self, - batch_size: int, - num_beams: int, - device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, - max_length: Optional[int] = None, - ): - self.num_beams = num_beams - self.device = device - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups - - self._is_init = False - # self._beam_hyps[i*self.num_beam_groups+j] is the beam_hyps of the j-th group in the i-th mini-batch. - # If group_beam_search is not used, the list consists of `batch_size` beam_hyps. - self._beam_hyps = [ - BeamHypotheses( - num_beams=self.group_size, - length_penalty=self.length_penalty, - early_stopping=self.do_early_stopping, - max_length=max_length, - ) - for _ in range(batch_size * self.num_beam_groups) - ] - # self._done[i*self.num_beam_groups+j] indicates whether the generation of the beam_hyps of the j-th group - # in the i-th mini-batch is complete. - self._done = torch.tensor( - [False for _ in range(batch_size * self.num_beam_groups)], dtype=torch.bool, device=self.device - ) - - if not isinstance(num_beams, int) or num_beams <= 1: - raise ValueError( - f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1," - " one should make use of `greedy_search` instead." 
- ) - - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - - @property - def is_done(self) -> bool: - return self._done.all() - - def process( - self, - input_ids: torch.LongTensor, - next_scores: torch.FloatTensor, - next_tokens: torch.LongTensor, - next_indices: torch.LongTensor, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - group_index: Optional[int] = 0, - decoder_prompt_len: Optional[int] = 0, - ) -> dict[str, torch.Tensor]: - # add up to the length which the next_scores is calculated on (including decoder prompt) - cur_len = input_ids.shape[-1] + 1 - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." - ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) - - device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - for batch_idx in range(batch_size): - batch_group_idx = batch_idx * self.num_beam_groups + group_index - if self._done[batch_group_idx]: - if self.num_beams < len(self._beam_hyps[batch_group_idx]): - raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated") - if eos_token_id is None or pad_token_id is None: - raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined") - # pad the batch - next_beam_scores[batch_idx, :] = 0 - next_beam_tokens[batch_idx, :] = pad_token_id - next_beam_indices[batch_idx, :] = 0 - continue - - # next tokens for this sentence - beam_idx = 0 - for beam_token_rank, (next_token, next_score, next_index) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) - ): - batch_beam_idx = batch_idx * self.group_size + next_index - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size - if is_beam_token_worse_than_top_num_beams: - continue - if beam_indices is not None: - beam_index = beam_indices[batch_beam_idx] - beam_index = beam_index + (batch_beam_idx,) - else: - beam_index = None - - self._beam_hyps[batch_group_idx].add( - input_ids[batch_beam_idx].clone(), - next_score.item(), - beam_indices=beam_index, - generated_len=cur_len - 
decoder_prompt_len, - ) - else: - # add next predicted token since it is not eos_token - next_beam_scores[batch_idx, beam_idx] = next_score - next_beam_tokens[batch_idx, beam_idx] = next_token - next_beam_indices[batch_idx, beam_idx] = batch_beam_idx - beam_idx += 1 - - # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: - break - - if beam_idx < self.group_size: - raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" - f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." - ) - - # Check if we are done so that we can save a pad step if all(done) - self._done[batch_group_idx] = self._done[batch_group_idx] or self._beam_hyps[batch_group_idx].is_done( - next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len - ) - - return UserDict( - { - "next_beam_scores": next_beam_scores.view(-1), - "next_beam_tokens": next_beam_tokens.view(-1), - "next_beam_indices": next_beam_indices.view(-1), - } - ) - - def finalize( - self, - input_ids: torch.LongTensor, - final_beam_scores: torch.FloatTensor, - final_beam_tokens: torch.LongTensor, - final_beam_indices: torch.LongTensor, - max_length: int, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, - ) -> tuple[torch.LongTensor]: - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - # finalize all open beam hypotheses and add to generated hypotheses - for batch_group_idx, beam_hyp in enumerate(self._beam_hyps): - if self._done[batch_group_idx]: - continue - - # all open beam hypotheses are added to the beam hypothesis - # beam hypothesis class automatically keeps the best beams - for index_per_group in range(self.group_size): - batch_beam_idx = batch_group_idx * self.group_size + index_per_group - final_score = final_beam_scores[batch_beam_idx].item() - final_tokens = input_ids[batch_beam_idx] - beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None - generated_len = final_tokens.shape[-1] - decoder_prompt_len - beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len) - - # select the best hypotheses - sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) - best = [] - best_indices = [] - best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) - - # retrieve best hypotheses - for i in range(batch_size): - beam_hyps_in_batch = self._beam_hyps[i * self.num_beam_groups : (i + 1) * self.num_beam_groups] - candidate_beams = [beam for beam_hyp in beam_hyps_in_batch for beam in beam_hyp.beams] - sorted_hyps = sorted(candidate_beams, key=lambda x: x[0]) - for j in range(self.num_beam_hyps_to_keep): - best_hyp_tuple = sorted_hyps.pop() - best_score = best_hyp_tuple[0] - best_hyp = best_hyp_tuple[1] - best_index = best_hyp_tuple[2] - sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) - - # append hyp to lists - best.append(best_hyp) - - # append indices to list - best_indices.append(best_index) - - best_scores[i * self.num_beam_hyps_to_keep + j] = best_score - - # prepare for adding eos - sent_lengths_max = 
sent_lengths.max().item() + 1 - sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max - decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - - if len(best_indices) > 0 and best_indices[0] is not None: - indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - else: - indices = None - - # shorter batches are padded if needed - if sent_lengths.min().item() != sent_lengths.max().item(): - if pad_token_id is None: - raise ValueError("`pad_token_id` has to be defined") - decoded.fill_(pad_token_id) - - if indices is not None: - indices.fill_(-1) - - # fill with hypotheses and eos_token_id if the latter fits in - for i, (hypo, best_idx) in enumerate(zip(best, best_indices)): - decoded[i, : sent_lengths[i]] = hypo - - if indices is not None: - indices[i, : len(best_idx)] = torch.tensor(best_idx) - - if sent_lengths[i] < sent_max_len: - # inserting only the first eos_token_id - decoded[i, sent_lengths[i]] = eos_token_id[0] - - return UserDict( - { - "sequences": decoded, - "sequence_scores": best_scores, - "beam_indices": indices, - } - ) - - class ConstrainedBeamSearchScorer(BeamScorer): r""" [`BeamScorer`] implementing constrained beam search decoding. @@ -446,9 +148,6 @@ class ConstrainedBeamSearchScorer(BeamScorer): num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): The number of beam hypotheses that shall be returned upon calling [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. max_length (`int`, *optional*): The maximum length of the sequence to be generated. """ @@ -462,7 +161,6 @@ def __init__( length_penalty: Optional[float] = 1.0, do_early_stopping: Optional[Union[bool, str]] = False, num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, max_length: Optional[int] = None, ): self.num_beams = num_beams @@ -470,8 +168,6 @@ def __init__( self.length_penalty = length_penalty self.do_early_stopping = do_early_stopping self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups self.constraints = constraints self._is_init = False @@ -492,12 +188,6 @@ def __init__( " one should make use of `greedy_search` instead." ) - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - @property def is_done(self) -> bool: return self._done.all() @@ -564,23 +254,12 @@ def process( # add up to the length which the next_scores is calculated on (including decoder prompt) cur_len = input_ids.shape[-1] + 1 batch_size = len(self._beam_hyps) - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." 
- ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) + next_beam_scores = torch.zeros((batch_size, self.num_beams), dtype=next_scores.dtype, device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), dtype=next_tokens.dtype, device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), dtype=next_indices.dtype, device=device) if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): if isinstance(eos_token_id, int): @@ -604,11 +283,11 @@ def process( for beam_token_rank, (next_token, next_score, next_index) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) ): - batch_beam_idx = batch_idx * self.group_size + next_index + batch_beam_idx = batch_idx * self.num_beams + next_index # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (next_token.item() in eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams if is_beam_token_worse_than_top_num_beams: continue @@ -634,7 +313,7 @@ def process( beam_idx += 1 # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: + if beam_idx == self.num_beams: break new_scores, new_tokens, new_indices = self.step_sentence_constraint( @@ -650,9 +329,9 @@ def process( next_beam_tokens[batch_idx] = new_tokens next_beam_indices[batch_idx] = new_indices - if beam_idx < self.group_size: + if beam_idx < self.num_beams: raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" + f"At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 1edaf19948e8..177fa8064857 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -89,7 +89,6 @@ class GenerationConfig(PushToHubMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -134,9 +133,6 @@ class GenerationConfig(PushToHubMixin): Whether or not to use sampling ; use greedy decoding otherwise. num_beams (`int`, *optional*, defaults to 1): Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. 
- [this paper](https://huggingface.co/papers/1610.02424) for more details. > Parameters that control the cache @@ -190,9 +186,6 @@ class GenerationConfig(PushToHubMixin): probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details. - diversity_penalty (`float`, *optional*, defaults to 0.0): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. repetition_penalty (`float`, *optional*, defaults to 1.0): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://huggingface.co/papers/1909.05858) for more details. @@ -359,7 +352,6 @@ def __init__(self, **kwargs): # Parameters that control the generation strategy used self.do_sample = kwargs.pop("do_sample", False) self.num_beams = kwargs.pop("num_beams", 1) - self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # Parameters that control the cache self.use_cache = kwargs.pop("use_cache", True) @@ -377,7 +369,6 @@ def __init__(self, **kwargs): self.typical_p = kwargs.pop("typical_p", 1.0) self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0) self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0) - self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) @@ -441,6 +432,8 @@ def __init__(self, **kwargs): self.low_memory = kwargs.pop("low_memory", None) self.penalty_alpha = kwargs.pop("penalty_alpha", None) self.dola_layers = kwargs.pop("dola_layers", None) + self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub # interface. @@ -628,14 +621,6 @@ def validate(self, strict=False): minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format( flag_name="early_stopping", flag_value=self.early_stopping ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - minor_issues["num_beam_groups"] = single_beam_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - if self.diversity_penalty is not None and self.diversity_penalty != 0.0: - minor_issues["diversity_penalty"] = single_beam_wrong_parameter_msg.format( - flag_name="diversity_penalty", flag_value=self.diversity_penalty - ) if self.length_penalty is not None and self.length_penalty != 1.0: minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format( flag_name="length_penalty", flag_value=self.length_penalty @@ -658,27 +643,6 @@ def validate(self, strict=False): raise ValueError( constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=self.do_sample) ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - raise ValueError( - constrained_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - ) - # group beam search - elif self.diversity_penalty != 0.0 or self.num_beam_groups != 1: - group_error_prefix = ( - "`diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. 
In " - "this generation mode, " - ) - if self.do_sample is True: - raise ValueError(group_error_prefix + "`do_sample` must be set to `False`") - if self.num_beams % self.num_beam_groups != 0: - raise ValueError(group_error_prefix + "`num_beams` should be divisible by `num_beam_groups`") - if self.diversity_penalty == 0.0: - raise ValueError( - group_error_prefix - + "`diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical." - ) # 2.4. check `num_return_sequences` if self.num_return_sequences != 1: diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 14b4b54aa1c5..abc08ef2eb5c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1441,142 +1441,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to return scores_processed -class HammingDiversityLogitsProcessor(LogitsProcessor): - r""" - [`LogitsProcessor`] that enforces diverse beam search. - - Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam - Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) for more - details. - - Traditional beam search often generates very similar sequences across different beams. - `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by other - beams in the same time step. - - Args: - diversity_penalty (`float`): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. A higher `diversity_penalty` will enforce greater diversity among the beams. Adjusting - this value can help strike a balance between diversity and natural likelihood. - num_beams (`int`): - Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - [this paper](https://huggingface.co/papers/1610.02424) for more details. - - Examples: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - >>> import torch - - >>> # Initialize the model and tokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - >>> # A long text about the solar system - >>> text = ( - ... "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, " - ... "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight " - ... "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System " - ... "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant " - ... "interstellar molecular cloud." - ... ) - >>> inputs = tokenizer("summarize: " + text, return_tensors="pt") - - >>> # Generate diverse summary - >>> outputs_diverse = model.generate( - ... **inputs, - ... num_beam_groups=2, - ... diversity_penalty=10.0, - ... max_length=100, - ... num_beams=4, - ... num_return_sequences=2, - ... ) - >>> summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True) - - >>> # Generate non-diverse summary - >>> outputs_non_diverse = model.generate( - ... **inputs, - ... max_length=100, - ... num_beams=4, - ... 
num_return_sequences=2, - ... ) - >>> summary_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True) - - >>> # With `diversity_penalty`, the resulting beams are much more diverse - >>> print(summary_non_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the Solar System formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.'] - - >>> print(summaries_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets. the rest of the objects are smaller objects, such as the five dwarf planets and small solar system bodies.'] - ``` - """ - - def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): - if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): - raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") - self._diversity_penalty = diversity_penalty - if not isinstance(num_beams, int) or num_beams < 2: - raise ValueError("`num_beams` should be an integer strictly larger than 1.") - self._num_beams = num_beams - if not isinstance(num_beam_groups, int) or num_beam_groups < 2: - raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") - if num_beam_groups > num_beams: - raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") - self._num_sub_beams = num_beams // num_beam_groups - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - current_tokens: torch.LongTensor, - beam_group_idx: int, - ) -> torch.FloatTensor: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`): - Prediction scores of a language modeling head. These can be logits for each vocabulary when not using - beam search or log softmax for each vocabulary token when using beam search - current_tokens (`torch.LongTensor` of shape `(batch_size)`): - Indices of input sequence tokens in the vocabulary, corresponding to the tokens selected by the other - beam groups in the current generation step. - beam_group_idx (`int`): - The index of the beam group currently being processed. - - Return: - `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: - The processed prediction scores. 
- """ - # hamming diversity: penalise using same token in current group which was used in previous groups at - # the same time step - batch_size = current_tokens.shape[0] // self._num_beams - group_start_idx = beam_group_idx * self._num_sub_beams - group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) - group_size = group_end_idx - group_start_idx - vocab_size = scores.shape[-1] - - if group_start_idx == 0: - return scores - - scores_processed = scores.clone() - for batch_idx in range(batch_size): - # predicted tokens of last time step of previous groups - previous_group_tokens = current_tokens[ - batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx - ] - token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) - scores_processed[batch_idx * group_size : (batch_idx + 1) * group_size] -= ( - self._diversity_penalty * token_frequency - ) - - return scores_processed - - class ForcedBOSTokenLogitsProcessor(LogitsProcessor): r""" [`LogitsProcessor`] that enforces the specified token as the first generated token. Used with encoder-decoder diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index e03ad600deb3..68db1406d67d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -54,7 +54,7 @@ logging, ) from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint -from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer +from .beam_search import ConstrainedBeamSearchScorer from .candidate_generator import ( AssistantVocabTranslatorCache, AssistedCandidateGenerator, @@ -82,7 +82,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessorList, @@ -371,7 +370,6 @@ class GenerationMixin(ContinuousMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -1114,14 +1112,6 @@ def _get_logits_processor( if generation_config.sequence_bias is not None: processors.append(SequenceBiasLogitsProcessor(sequence_bias=generation_config.sequence_bias)) - if generation_config.diversity_penalty is not None and generation_config.diversity_penalty > 0.0: - processors.append( - HammingDiversityLogitsProcessor( - diversity_penalty=generation_config.diversity_penalty, - num_beams=generation_config.num_beams, - num_beam_groups=generation_config.num_beam_groups, - ) - ) if ( generation_config.encoder_repetition_penalty is not None and generation_config.encoder_repetition_penalty != 1.0 @@ -1196,7 +1186,7 @@ def _get_logits_processor( processors.append( PrefixConstrainedLogitsProcessor( prefix_allowed_tokens_fn, - generation_config.num_beams // generation_config.num_beam_groups, + generation_config.num_beams, ) ) if generation_config.forced_bos_token_id is not None: @@ -2559,28 +2549,22 @@ def generate( elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: logger.warning_once( - "Group Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. 
" - "To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call." - ) - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - num_beam_groups=generation_config.num_beam_groups, - max_length=generation_config.max_length, + "Group Beam Search was moved to a `custom_generate` repo: https://hf.co/transformers-community/group-beam-search. " + "To prevent loss of backward compatibility, add `custom_generate='transformers-community/group-beam-search'` " + "to your `generate` call before v4.62.0." ) - result = self._group_beam_search( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, + if not trust_remote_code: + raise ValueError( + "Group Beam Search requires `trust_remote_code=True` in your `generate` call, since " + "it loads https://hf.co/transformers-community/group-beam-search." + ) + return GenerationMixin.generate( + self, + inputs, + custom_generate="transformers-community/group-beam-search", generation_config=generation_config, - synced_gpus=synced_gpus, - **model_kwargs, + trust_remote_code=trust_remote_code, + **kwargs, ) elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: @@ -3527,301 +3511,6 @@ def _beam_search( else: return sequences - def _group_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: LogitsProcessorList, - stopping_criteria: StoppingCriteriaList, - generation_config: GenerationConfig, - synced_gpus: bool, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - generation_config ([`~generation.GenerationConfig`]): - The generation configuration to be used as parametrization of the decoding method. - synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed to avoid deadlocking with - `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). - model_kwargs: - Additional model specific kwargs that will be forwarded to the `forward` function of the model. If - model is an encoder-decoder model the kwargs should include `encoder_outputs`. 
- - Return: - [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - """ - # init values - pad_token_id = generation_config._pad_token_tensor - eos_token_id = generation_config._eos_token_tensor - output_attentions = generation_config.output_attentions - output_hidden_states = generation_config.output_hidden_states - output_scores = generation_config.output_scores - output_logits = generation_config.output_logits - return_dict_in_generate = generation_config.return_dict_in_generate - - num_beams = beam_scorer.num_beams - num_beam_groups = beam_scorer.num_beam_groups - num_sub_beams = num_beams // num_beam_groups - batch_size = len(beam_scorer._beam_hyps) // num_beam_groups - device = input_ids.device - - batch_beam_size, cur_len = input_ids.shape - model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs) - - if return_dict_in_generate and output_scores: - beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)] - else: - beam_indices = None - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in - # the same group don't produce same tokens every time. 
- beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) - beam_scores[:, ::num_sub_beams] = 0 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False - - decoder_prompt_len = input_ids.shape[1] # record the prompt length of decoder - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # predicted tokens in cur_len step - current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) - - # indices which will form the beams in the next time step - reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device) - - # do one decoder step on all beams of all sentences in batch - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # prepare variable output controls (note: some models won't accept all output controls) - model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) - model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) - - outputs = self(**model_inputs, return_dict=True) - - # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue - - if output_scores: - processed_score = torch.zeros_like(outputs.logits[:, -1, :]) - if output_logits: - # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration - # (the clone itself is always small) - raw_logit_score = outputs.logits[:, -1, :].to(copy=True, device=input_ids.device) - - for beam_group_idx in range(num_beam_groups): - group_start_idx = beam_group_idx * num_sub_beams - group_end_idx = min(group_start_idx + num_sub_beams, num_beams) - group_size = group_end_idx - group_start_idx - - # indices of beams of current group among all sentences in batch - batch_group_indices = [] - - for batch_idx in range(batch_size): - batch_group_indices.extend( - [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] - ) - group_input_ids = input_ids[batch_group_indices] - - # select outputs of beams of current group only - # No need to clone() the logits here as they will not retain outputs.logits at the end of the loop - # .float() is needed to retain precision for later logits manipulations - next_token_logits = outputs.logits[batch_group_indices, -1, :].to( - dtype=torch.float32, device=input_ids.device - ) - - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * group_size, vocab_size) - vocab_size = next_token_scores.shape[-1] - - next_token_scores_processed = logits_processor( - group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx - ) - next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1) - next_token_scores = next_token_scores.expand_as(next_token_scores_processed) - - if output_scores: - processed_score[batch_group_indices] = next_token_scores_processed - - # reshape for beam search - next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * group_size, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - beam_outputs = beam_scorer.process( - group_input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=process_beam_indices, - group_index=beam_group_idx, - decoder_prompt_len=decoder_prompt_len, - ) - beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - if return_dict_in_generate and output_scores: - beam_indices[beam_group_idx] = tuple( - beam_indices[beam_group_idx][beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices[0])) - ) - - input_ids[batch_group_indices] = group_input_ids[beam_idx] - group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - current_tokens[batch_group_indices] = group_input_ids[:, -1] - - # (beam_idx // group_size) -> batch_idx - # (beam_idx % group_size) -> offset of idx inside the group - reordering_indices[batch_group_indices] = ( - num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") - + group_start_idx - + (beam_idx % group_size) - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (processed_score,) - if output_logits: - raw_logits += (raw_logit_score,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) - - # This is needed to properly delete outputs.logits which may be very large for first iteration - # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration - # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory - # (that way the memory peak does not include outputs.logits) - del outputs - - # NOTE: we need to check if `self._reorder_cache` exists for special models like RAG, RecurrentGemma etc. 
- if model_kwargs.get("past_key_values", None) is not None: - if hasattr(self, "_reorder_cache"): - model_kwargs["past_key_values"] = self._reorder_cache( - model_kwargs["past_key_values"], reordering_indices - ) - else: - model_kwargs["past_key_values"].reorder_cache(reordering_indices) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): - this_peer_finished = True - - final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=final_beam_indices, - decoder_prompt_len=decoder_prompt_len, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return sequence_outputs["sequences"] - def _constrained_beam_search( self, input_ids: torch.LongTensor, diff --git a/src/transformers/models/dia/generation_dia.py b/src/transformers/models/dia/generation_dia.py index 7cac22f0d483..22b607ec2865 100644 --- a/src/transformers/models/dia/generation_dia.py +++ b/src/transformers/models/dia/generation_dia.py @@ -400,7 +400,7 @@ def _main_generate_loop( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) @torch.no_grad() diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 8ee43bd29184..d241268bd3cd 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -1270,7 +1270,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
) # Validate the configuration and model kwargs diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 2ae65710b2a5..0d15572b527b 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1130,7 +1130,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) # Validate the configuration and model kwargs diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c8df024e4e5a..d91a198da816 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1321,7 +1321,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2371,7 +2371,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 28ef14ee15fe..ec544cb88804 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1235,7 +1235,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2236,7 +2236,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
) if generation_config.return_dict_in_generate: diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c892fb365cbc..ce3fd036aef2 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -177,13 +177,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class BeamSearchScorer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ClassifierFreeGuidanceLogitsProcessor(metaclass=DummyObject): _backends = ["torch"] @@ -282,13 +275,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class HammingDiversityLogitsProcessor(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class InfNanRemoveLogitsProcessor(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/generation/test_beam_search.py b/tests/generation/test_beam_search.py index bd791bbc8fb8..69bb0a40e292 100644 --- a/tests/generation/test_beam_search.py +++ b/tests/generation/test_beam_search.py @@ -26,230 +26,12 @@ from transformers.generation import ( BeamHypotheses, - BeamSearchScorer, ConstrainedBeamSearchScorer, DisjunctiveConstraint, PhrasalConstraint, ) -class BeamSearchTester: - def __init__( - self, - parent, - batch_size=3, - sequence_length=10, - vocab_size=99, - pad_token_id=0, - max_length=20, - num_beams=4, - length_penalty=2.0, - do_early_stopping=True, - num_beam_hyps_to_keep=2, - ): - self.parent = parent - self.batch_size = batch_size - self.sequence_length = sequence_length - self.vocab_size = vocab_size - self.pad_token_id = pad_token_id - self.max_length = max_length - self.num_beams = num_beams - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - - # cannot be randomly generated - self.eos_token_id = vocab_size + 1 - - def prepare_beam_scorer(self, **kwargs): - return BeamSearchScorer( - batch_size=kwargs.get("batch_size", self.batch_size), - num_beams=kwargs.get("num_beams", self.num_beams), - device=torch_device, - length_penalty=kwargs.get("length_penalty", self.length_penalty), - do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping), - num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep), - ) - - def prepare_inputs(self): - input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size) - next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device) - next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device) - next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True) - return (input_ids, next_tokens, next_indices, next_scores) - - def check_beam_hypotheses(self, input_ids, *args): - # check that correct number of beam hypotheses is set in beam scorer - beam_scorer = self.prepare_beam_scorer(do_early_stopping=True) - beam_hyp = beam_scorer._beam_hyps[0] - - self.parent.assertEqual(len(beam_scorer._beam_hyps), self.batch_size) - - # check correct type - self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses)) - - # check that num_beams is correctly set - self.parent.assertEqual(beam_hyp.num_beams, self.num_beams) - - # check for early stopping deactivated - for beam_idx in range(self.num_beams): - 
beam_hyp.add(input_ids[beam_idx], -10.0) - - # if early stopping True -> score does not matter - self.parent.assertTrue(beam_hyp.is_done(-10.0, 5)) - - # re-init - beam_scorer = self.prepare_beam_scorer(do_early_stopping=False) - beam_hyp = beam_scorer._beam_hyps[0] - - # add `num_beams + 1` beams to change `worst_score` - for beam_idx in range(self.num_beams + 1): - beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx)) - - # -10.0 is removed => -9.0 is worst score - self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty)) - - # -5.0 is better than worst score => should not be finished - self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length)) - - # -20.0 is worse than worst score => should be finished - self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length)) - - def check_beam_scorer_update(self, input_ids, next_tokens, next_indices, next_scores): - # check too many eos tokens - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[0, :] = self.eos_token_id - - with self.parent.assertRaises(ValueError): - beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id) - - # check all batches are done - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, : self.num_beams] = self.eos_token_id - beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device) - beam_indices = tuple(tuple(b) for b in beam_indices) - beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices - ) - # beam scorer should be done - self.parent.assertTrue(beam_scorer.is_done) - - # check - beam_scorer = self.prepare_beam_scorer() - - tokens = next_tokens.clone() - tokens[:, 1] = self.eos_token_id - beam_outputs = beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - - def cut_expected_tensor(tensor): - return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten() - - # check all outptus - # cut out id of eos token and take best `num_beams` outputs - expected_output_tokens = cut_expected_tensor(tokens) - expected_output_scores = cut_expected_tensor(next_scores) - - # add num_beams * batch_idx - offset = torch.div( - torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor" - ) - expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams - - self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist()) - self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist()) - self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3)) - - # make sure ids of eos token are correctly saved in beam_hyps of beam scorer - expected_beam_indices = list(range(10)) - for batch_idx in range(self.batch_size): - correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1] - self.parent.assertListEqual( - input_ids[correct_idx].tolist(), beam_scorer._beam_hyps[batch_idx].beams[0][1].tolist() - ) - self.parent.assertListEqual( - expected_beam_indices + [correct_idx], - torch.tensor(beam_scorer._beam_hyps[batch_idx].beams[0][2]).tolist(), - ) - - 
def check_beam_scores_finalize(self, input_ids, next_tokens, next_indices, next_scores): - # max_length should be only one more than current input_ids to check that eos is correctly appended - max_length = self.sequence_length + 1 - beam_scorer = self.prepare_beam_scorer(num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False) - - # update beams and append to input_ids - tokens = next_tokens.clone() - # first batch, first output has to finish with eos token id since scores are correctly sorted - tokens[0, 0] = self.eos_token_id - # make sure corresponding score is as good as possible to surely be picked first - next_scores[0, 0] = 0.0 - beam_outputs = beam_scorer.process( - input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id - ) - output_scores = beam_outputs["next_beam_scores"] - output_tokens = beam_outputs["next_beam_tokens"] - output_indices = beam_outputs["next_beam_indices"] - - input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1) - - # finalize - beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device) - beam_indices = tuple(tuple(b) for b in beam_indices) - sequence_output = beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - beam_indices=beam_indices, - ) - - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length` - self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size]) - - # check sequence_scores - self.parent.assertFalse((sequence_scores > 0).any().item()) - - # first batch has to finish with eos_token - self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id) - - # other batches cannot finish with eos token - self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id) - self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id) - - # now test that if `num_beam_hyps_to_keep` is 3 => all beams are returned - beam_scorer.num_beam_hyps_to_keep = self.num_beams - sequence_output = beam_scorer.finalize( - input_ids, - output_scores, - output_tokens, - output_indices, - pad_token_id=self.pad_token_id, - eos_token_id=self.eos_token_id, - max_length=max_length, - beam_indices=beam_indices, - ) - sequences = sequence_output["sequences"] - sequence_scores = sequence_output["sequence_scores"] - - self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length]) - self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size]) - - class ConstrainedBeamSearchTester: def __init__( self, @@ -540,24 +322,6 @@ def _check_sequence_inside_sequence(self, tensor_1, tensor_2): return flag -@require_torch -class BeamSearchTest(unittest.TestCase): - def setUp(self): - self.beam_search_tester = BeamSearchTester(self) - - def test_beam_hypotheses(self): - inputs = self.beam_search_tester.prepare_inputs() - self.beam_search_tester.check_beam_hypotheses(*inputs) - - def test_beam_scorer_update(self): - inputs = self.beam_search_tester.prepare_inputs() - self.beam_search_tester.check_beam_scorer_update(*inputs) - - def test_beam_scorer_finalize(self): - inputs = self.beam_search_tester.prepare_inputs() - 
self.beam_search_tester.check_beam_scores_finalize(*inputs) - - @require_torch class ConstrainedBeamSearchTest(unittest.TestCase): def setUp(self): diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py index 429d61cbd26a..e67e5ba325e0 100644 --- a/tests/generation/test_configuration_utils.py +++ b/tests/generation/test_configuration_utils.py @@ -39,7 +39,6 @@ ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, GenerationMode, - HammingDiversityLogitsProcessor, MinLengthLogitsProcessor, MinNewTokensLengthLogitsProcessor, MinPLogitsWarper, @@ -536,31 +535,6 @@ def prefix_allowed_tokens_fn(batch_id, inputs_ids): ) self.assertEqual(prefix_constrained_logits_proc._num_beams, num_beams) - def test_serialize_generation_diversity_penalty_and_num_bean_groups(self): - """Tests that GenerationConfig is serialized and HammingDiversityLogitsProcessor is initialized with diversity_penalty_and_num_bean_groups""" - num_beams = 2 - num_beam_groups = 2 - diversity_penalty = 1.0 - - generation_config = GenerationConfig( - num_beams=num_beams, diversity_penalty=diversity_penalty, num_beam_groups=num_beam_groups - ) - with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir: - generation_config.save_pretrained(tmp_dir) - new_config = GenerationConfig.from_pretrained(tmp_dir) - self.assertEqual(new_config.num_beams, num_beams) - self.assertEqual(new_config.diversity_penalty, diversity_penalty) - self.assertEqual(new_config.num_beam_groups, num_beam_groups) - - diversity_logits_processor = HammingDiversityLogitsProcessor( - diversity_penalty=new_config.diversity_penalty, - num_beams=new_config.num_beams, - num_beam_groups=new_config.num_beam_groups, - ) - self.assertEqual(diversity_logits_processor._num_beams, num_beams) - self.assertEqual(diversity_logits_processor._diversity_penalty, diversity_penalty) - self.assertEqual(diversity_logits_processor._num_sub_beams, num_beams // num_beam_groups) - def test_serialize_generation_bos_token_id(self): """Tests that GenerationConfig is serialized and ForcedBOSTokenLogitsProcessor is initialized with bos_token_id""" bos_token_id = 0 diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py index df68f9c62100..768e216ef534 100644 --- a/tests/generation/test_logits_process.py +++ b/tests/generation/test_logits_process.py @@ -36,7 +36,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessorList, @@ -796,36 +795,6 @@ def empty_prefix_allowed_tokens_fn(batch_id, inputs_ids): # processor should not change logits in-place self.assertFalse(torch.all(scores == filtered_scores)) - def test_hamming_diversity(self): - vocab_size = 4 - num_beams = 2 - num_beam_groups = 2 - - scores = self._get_uniform_logits(num_beams, vocab_size) - # batch_idx = 0 -> index batch_idx * num_beam_groups -> idx = 0 * 2 = 0 -> penalises tokens 1 - # batch_idx = 1 -> index batch_idx * num_beam_groups -> idx = 1 * 2 = 2 -> penalises tokens 1 - current_tokens = torch.tensor([0, 3, 1, 2], device=torch_device, dtype=torch.long) - - diversity_logits_processor = HammingDiversityLogitsProcessor( - diversity_penalty=1.0, num_beams=num_beams, num_beam_groups=num_beam_groups - ) - - processed_scores = diversity_logits_processor(None, scores, current_tokens, 1) - - self.assertTrue( - torch.allclose( - processed_scores[0], torch.tensor([-0.7500, 0.2500, 0.2500, 
0.2500], device=torch_device), atol=1e-3 - ) - ) - self.assertTrue( - torch.allclose( - processed_scores[1], torch.tensor([0.2500, -0.7500, 0.2500, 0.2500], device=torch_device), atol=1e-3 - ) - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - def test_forced_bos_token_logits_processor(self): vocab_size = 20 batch_size = 4 diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 449d8122c12b..ff7b77d6a660 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -217,6 +217,7 @@ def _get_diverse_beam_kwargs(self, num_return_sequences=1): "num_return_sequences": num_return_sequences, "num_beam_groups": 2, # one beam per group "diversity_penalty": 2.0, + "trust_remote_code": True, } return beam_kwargs @@ -2651,6 +2652,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None): @pytest.mark.generate @require_torch class GenerationIntegrationTests(unittest.TestCase): + # TODO joao, manuel: remove in v4.62.0 @slow def test_diverse_beam_search(self): article = """Justin Timberlake and Jessica Biel, welcome to parenthood. @@ -2669,6 +2671,7 @@ def test_diverse_beam_search(self): num_beam_groups=4, diversity_penalty=2.0, remove_invalid_values=True, + trust_remote_code=True, ) generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -2849,7 +2852,7 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): model = model.to(torch_device) input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids.to(torch_device) - outputs = model.generate(input_ids=input_ids) + outputs = model.generate(input_ids=input_ids, trust_remote_code=True) transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) transition_scores_sum = transition_scores.sum(-1) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index d6a34a361354..9ac8e462e8e3 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -434,9 +434,7 @@ def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self): inputs = tokenizer(input_text, return_tensors="pt").to(device) common = { "num_beams": 4, - "num_beam_groups": 2, "num_return_sequences": 4, - "diversity_penalty": 1.0, "max_new_tokens": 20, "early_stopping": True, } diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index 60e8703937cd..a3f504f7ea73 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -53,8 +53,6 @@ "do_sample": True, "early_stopping": True, "num_beams": 3, - "num_beam_groups": 3, - "diversity_penalty": 0.5, "temperature": 2.0, "top_k": 10, "top_p": 0.7, From 62cb274a4acb9f24201902242f1b0dc4e46daac1 Mon Sep 17 00:00:00 2001 From: Manuel de Prada Corral Date: Wed, 27 Aug 2025 11:48:19 +0200 Subject: [PATCH 3/5] fix --- src/transformers/dynamic_module_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 31bba4fc83ae..0cef5308b39b 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -428,10 +428,10 @@ def get_cached_module_file( importlib.invalidate_caches() # Make sure we also have every file with relative for module_needed in modules_needed: - if not (submodule_path / f"{module_needed}.py").exists(): + if not ((submodule_path / module_file).parent / 
f"{module_needed}.py").exists(): get_cached_module_file( pretrained_model_name_or_path, - f"{module_needed}.py", + f"{Path(module_file).parent / module_needed}.py", cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, From 822efd8c3cf475d079e64293aa06e4ab59740fd7 Mon Sep 17 00:00:00 2001 From: Manuel de Prada Corral Date: Wed, 27 Aug 2025 15:59:51 +0200 Subject: [PATCH 4/5] aaand remove tests after all green!! --- src/transformers/configuration_utils.py | 3 + tests/generation/test_utils.py | 136 +++--------------- tests/models/csm/test_modeling_csm.py | 10 -- tests/models/dia/test_modeling_dia.py | 1 - .../test_modeling_recurrent_gemma.py | 10 -- tests/models/rwkv/test_modeling_rwkv.py | 7 - tests/models/whisper/test_modeling_whisper.py | 6 - 7 files changed, 24 insertions(+), 149 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 035e8ce791a6..d450193dbc2d 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1139,6 +1139,9 @@ def _get_global_generation_defaults() -> dict[str, Any]: "exponential_decay_length_penalty": None, "suppress_tokens": None, "begin_suppress_tokens": None, + # Deprecated arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0 + "num_beam_groups": 1, + "diversity_penalty": 0.0, } def _get_non_default_generation_parameters(self) -> dict[str, Any]: diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index ff7b77d6a660..bfb1ba4111dd 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -209,18 +209,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): } return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - "num_beam_groups": 2, # one beam per group - "diversity_penalty": 2.0, - "trust_remote_code": True, - } - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): beam_kwargs = { "early_stopping": False, @@ -352,36 +340,6 @@ def _beam_sample_generate( return output_generate - def _group_beam_search_generate( - self, - model, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - output_generate = model.generate( - do_sample=False, - max_new_tokens=self.max_new_tokens, - min_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **inputs_dict, - ) - - return output_generate - def _constrained_beam_search_generate( self, model, @@ -748,77 +706,6 @@ def test_generate_without_input_ids(self): ) self.assertIsNotNone(output_ids_generate) - @pytest.mark.generate - def test_group_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - model = model_class(config).to(torch_device).eval() - # check `generate()` and `group_beam_search()` are equal - beam_kwargs = self._get_diverse_beam_kwargs() - 
output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - # check `group_beam_search` for higher than 1 `num_return_sequences` - num_return_sequences = 2 - beam_kwargs = self._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - if self.has_attentions: - config._attn_implementation = "eager" # can't output attentions otherwise - - model = model_class(config).to(torch_device).eval() - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertTrue( - output_generate.sequences.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1] - ) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_generate_outputs( - output_generate, - model.config, - num_return_sequences=beam_kwargs["num_return_sequences"], - num_beams=beam_kwargs["num_beams"], - ) - @is_flaky() # Some models have position-specific tokens, this test may try to force them in an invalid position @pytest.mark.generate def test_constrained_beam_search_generate(self): @@ -2672,6 +2559,7 @@ def test_diverse_beam_search(self): diversity_penalty=2.0, remove_invalid_values=True, trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", ) generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -2831,6 +2719,7 @@ def test_generate_input_values_as_encoder_kwarg(self): self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) self.assertEqual(output_sequences.shape, (2, 5)) + # TODO joao, manuel: remove in v4.62.0 def test_transition_scores_group_beam_search_encoder_decoder(self): articles = [ "Justin Timberlake and Jessica Biel, welcome to parenthood.", @@ -2839,12 +2728,14 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") model = BartForConditionalGeneration.from_pretrained( 
"hf-internal-testing/tiny-random-bart", + eos_token_id=None, + ) + generation_config = GenerationConfig( max_length=10, num_beams=2, num_beam_groups=2, num_return_sequences=2, diversity_penalty=1.0, - eos_token_id=None, return_dict_in_generate=True, output_scores=True, length_penalty=0.0, @@ -2852,7 +2743,12 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): model = model.to(torch_device) input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids.to(torch_device) - outputs = model.generate(input_ids=input_ids, trust_remote_code=True) + outputs = model.generate( + input_ids=input_ids, + generation_config=generation_config, + trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", + ) transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) transition_scores_sum = transition_scores.sum(-1) @@ -4823,6 +4719,16 @@ def test_generate_custom_cache_position(self): [ ("transformers-community/dola", {"dola_layers": "low"}), ("transformers-community/contrastive-search", {"penalty_alpha": 0.6, "top_k": 4}), + ( + "transformers-community/group-beam-search", + { + "do_sample": False, + "num_beams": 2, + "num_beam_groups": 2, + "diversity_penalty": 2.0, + "length_penalty": 2.0, + }, + ), ] ) def test_hub_gen_strategies(self, custom_generate, extra_kwargs): diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index f81685abd091..d77a86a201cb 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -272,16 +272,6 @@ def test_beam_search_generate_dict_outputs_use_cache(self): def test_beam_sample_generate_dict_output(self): pass - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate(self): - pass - - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate_dict_output(self): - pass - @pytest.mark.generate @unittest.skip(reason="CSM does not support constrained beam search.") def test_constrained_beam_search_generate(self): diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py index 5f51649619fe..2f09b65cf8f3 100644 --- a/tests/models/dia/test_modeling_dia.py +++ b/tests/models/dia/test_modeling_dia.py @@ -237,7 +237,6 @@ def skip_non_greedy_generate(self): skippable_tests = [ "test_sample_generate_dict_output", # return sequences > 1 "test_beam", - "test_group_beam", "test_constrained_beam", "test_contrastive", "test_assisted", diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index df8d2d6e508a..bcb7259004ba 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -138,16 +138,6 @@ def test_constrained_beam_search_generate_dict_output(self): def test_generate_without_input_ids(self): pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate(self): - pass - - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") @pytest.mark.generate def 
test_constrained_beam_search_generate(self): diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 808cef6ddcef..8682c1e75c58 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -401,13 +401,6 @@ def test_greedy_generate_dict_outputs(self): super().test_greedy_generate_dict_outputs() self.has_attentions = old_has_attentions - def test_group_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_group_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - def test_sample_generate_dict_output(self): # This model has a custom attention output shape AND config flags, let's skip those checks old_has_attentions = self.has_attentions diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index bc5b065d918b..a2dcccddb929 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -403,12 +403,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` beam_kwargs = super()._get_constrained_beam_kwargs(num_return_sequences=num_return_sequences) From c17bf304d5cf33af7f34f9f6057915d5f5821dae Mon Sep 17 00:00:00 2001 From: Manuel de Prada Corral Date: Wed, 27 Aug 2025 17:00:50 +0200 Subject: [PATCH 5/5] fix test --- tests/utils/test_configuration_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/utils/test_configuration_utils.py b/tests/utils/test_configuration_utils.py index a3f504f7ea73..60e8703937cd 100644 --- a/tests/utils/test_configuration_utils.py +++ b/tests/utils/test_configuration_utils.py @@ -53,6 +53,8 @@ "do_sample": True, "early_stopping": True, "num_beams": 3, + "num_beam_groups": 3, + "diversity_penalty": 0.5, "temperature": 2.0, "top_k": 10, "top_p": 0.7,
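
A minimal usage sketch of the new path, assuming the `transformers-community/group-beam-search` Hub repository referenced in the warning added above; the checkpoint name is illustrative, and the decoding parameters mirror the updated `test_diverse_beam_search`:

```py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Illustrative checkpoint; any generation-capable model follows the same call pattern.
checkpoint = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = tokenizer("Justin Timberlake and Jessica Biel, welcome to parenthood.", return_tensors="pt")

# The decoding loop is no longer `GenerationMixin._group_beam_search`; it is fetched from the Hub
# repo named in `custom_generate`, which is why `trust_remote_code=True` is required.
outputs = model.generate(
    **inputs,
    num_beams=4,
    num_beam_groups=4,
    diversity_penalty=2.0,
    remove_invalid_values=True,
    custom_generate="transformers-community/group-beam-search",
    trust_remote_code=True,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

Note that `num_beam_groups` and `diversity_penalty` are still accepted by `GenerationConfig` (they remain in `__init__` and in the deprecated global generation defaults), so existing configurations keep deserializing; only the decoding loop itself moves to the Hub.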