diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 5c7d27192292..63b70899af4d 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -225,28 +225,6 @@ outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=to tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -### Diverse beam search - -[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups. - -Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`). - -```py -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device - -device = infer_device() - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") -inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device) - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device) -# explicitly set to 100 because Llama2 generation length is 4096 -outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False) -tokenizer.batch_decode(outputs, skip_special_tokens=True) -'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a' -``` - ## Custom generation methods diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index ecd4e77fc5f7..9deb926b905f 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -108,9 +108,6 @@ generation.
[[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -219,10 +216,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index f58d4a995e80..f7bcd3252493 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -146,7 +146,7 @@ tokenizer = AutoTokenizer.from_pretrained(ckpt) model = AutoModelForCausalLM.from_pretrained(ckpt, dtype=torch.float16, device_map="auto") prompt = ["okay "*1000 + "Fun fact: The most"] inputs = tokenizer(prompt, return_tensors="pt").to(model.device) -beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, } +beams = { "num_beams": 40, "num_return_sequences": 20, "max_new_tokens": 23, "early_stopping": True, } out = resilient_generate(model, **inputs, **beams) responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True) ``` diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md index 856c4856c52f..45eec30c0765 100644 --- a/docs/source/ja/generation_strategies.md +++ b/docs/source/ja/generation_strategies.md @@ -241,43 +241,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### Diverse beam search decoding - -多様なビームサーチデコーディング戦略は、ビームサーチ戦略の拡張であり、選択肢からより多様なビームシーケンスを生成できるようにします。この仕組みの詳細については、[Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) をご参照ください。このアプローチには、`num_beams`、`num_beam_groups`、および `diversity_penalty` という3つの主要なパラメータがあります。多様性ペナルティは、出力がグループごとに異なることを保証し、ビームサーチは各グループ内で使用されます。 - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... "that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -...
"the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - ### Assisted Decoding ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใฏใ€ไธŠ่จ˜ใฎใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐๆˆฆ็•ฅใ‚’ๅค‰ๆ›ดใ—ใŸใ‚‚ใฎใงใ€ๅŒใ˜ใƒˆใƒผใ‚ฏใƒŠใ‚คใ‚ถใƒผ๏ผˆ็†ๆƒณ็š„ใซใฏใฏใ‚‹ใ‹ใซๅฐใ•ใชใƒขใƒ‡ใƒซ๏ผ‰ใ‚’ไฝฟ็”จใ—ใฆใ€ใ„ใใคใ‹ใฎๅ€™่ฃœใƒˆใƒผใ‚ฏใƒณใ‚’่ฒชๆฌฒใซ็”Ÿๆˆใ™ใ‚‹ใ‚ขใ‚ทใ‚นใ‚ฟใƒณใƒˆใƒขใƒ‡ใƒซใ‚’ไฝฟ็”จใ—ใพใ™ใ€‚ใใฎๅพŒใ€ไธป่ฆใชใƒขใƒ‡ใƒซใฏๅ€™่ฃœใƒˆใƒผใ‚ฏใƒณใ‚’1ใคใฎๅ‰ๅ‘ใใƒ‘ใ‚นใงๆคœ่จผใ—ใ€ใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใƒ—ใƒญใ‚ปใ‚นใ‚’้ซ˜้€ŸๅŒ–ใ—ใพใ™ใ€‚็พๅœจใ€ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใงใฏ่ฒชๆฌฒๆคœ็ดขใจใ‚ตใƒณใƒ—ใƒชใƒณใ‚ฐใฎใฟใŒใ‚ตใƒใƒผใƒˆใ•ใ‚ŒใฆใŠใ‚Šใ€ใƒใƒƒใƒๅ…ฅๅŠ›ใฏใ‚ตใƒใƒผใƒˆใ•ใ‚Œใฆใ„ใพใ›ใ‚“ใ€‚ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใฎ่ฉณ็ดฐใซใคใ„ใฆใฏใ€[ใ“ใฎใƒ–ใƒญใ‚ฐ่จ˜ไบ‹](https://huggingface.co/blog/assisted-generation) ใ‚’ใ”่ฆงใใ ใ•ใ„ใ€‚ diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md index 1a5cc1dec079..c01d86f54bc0 100644 --- a/docs/source/ja/internal/generation_utils.md +++ b/docs/source/ja/internal/generation_utils.md @@ -139,9 +139,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -321,10 +318,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/ko/generation_strategies.md b/docs/source/ko/generation_strategies.md index da38e4f418f2..c59eff4111f3 100644 --- a/docs/source/ko/generation_strategies.md +++ b/docs/source/ko/generation_strategies.md @@ -232,44 +232,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### ๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰ ๋””์ฝ”๋”ฉ(Diverse beam search decoding)[[diverse-beam-search-decoding]] - -๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰(Decoding) ์ „๋žต์€ ์„ ํƒํ•  ์ˆ˜ ์žˆ๋Š” ๋” ๋‹ค์–‘ํ•œ ๋น” ์‹œํ€€์Šค ์ง‘ํ•ฉ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์žˆ๊ฒŒ ํ•ด์ฃผ๋Š” ๋น” ํƒ์ƒ‰ ์ „๋žต์˜ ํ™•์žฅ์ž…๋‹ˆ๋‹ค. ์ด ๋ฐฉ๋ฒ•์€ ์–ด๋–ป๊ฒŒ ์ž‘๋™ํ•˜๋Š”์ง€ ์•Œ์•„๋ณด๋ ค๋ฉด, [๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰: ์‹ ๊ฒฝ ์‹œํ€€์Šค ๋ชจ๋ธ์—์„œ ๋‹ค์–‘ํ•œ ์†”๋ฃจ์…˜ ๋””์ฝ”๋”ฉํ•˜๊ธฐ](https://huggingface.co/papers/1610.02424)๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”. ์ด ์ ‘๊ทผ ๋ฐฉ์‹์€ ์„ธ ๊ฐ€์ง€ ์ฃผ์š” ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ๊ฐ€์ง€๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค: `num_beams`, `num_beam_groups`, ๊ทธ๋ฆฌ๊ณ  `diversity_penalty`. ๋‹ค์–‘์„ฑ ํŒจ๋„ํ‹ฐ๋Š” ๊ทธ๋ฃน ๊ฐ„์— ์ถœ๋ ฅ์ด ์„œ๋กœ ๋‹ค๋ฅด๊ฒŒ ํ•˜๊ธฐ ์œ„ํ•œ ๊ฒƒ์ด๋ฉฐ, ๊ฐ ๊ทธ๋ฃน ๋‚ด์—์„œ ๋น” ํƒ์ƒ‰์ด ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... 
"that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -... "the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - -์ด ๊ฐ€์ด๋“œ์—์„œ๋Š” ๋‹ค์–‘ํ•œ ๋””์ฝ”๋”ฉ ์ „๋žต์„ ๊ฐ€๋Šฅํ•˜๊ฒŒ ํ•˜๋Š” ์ฃผ์š” ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค. [`generate`] ๋ฉ”์„œ๋“œ์— ๋Œ€ํ•œ ๊ณ ๊ธ‰ ๋งค๊ฐœ๋ณ€์ˆ˜๊ฐ€ ์กด์žฌํ•˜๋ฏ€๋กœ [`generate`] ๋ฉ”์„œ๋“œ์˜ ๋™์ž‘์„ ๋”์šฑ ์„ธ๋ถ€์ ์œผ๋กœ ์ œ์–ดํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋งค๊ฐœ๋ณ€์ˆ˜์˜ ์ „์ฒด ๋ชฉ๋ก์€ [API ๋ฌธ์„œ](./main_classes/text_generation)๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”. - ### ์ถ”๋ก  ๋””์ฝ”๋”ฉ(Speculative Decoding)[[speculative-decoding]] ์ถ”๋ก  ๋””์ฝ”๋”ฉ(๋ณด์กฐ ๋””์ฝ”๋”ฉ(assisted decoding)์œผ๋กœ๋„ ์•Œ๋ ค์ง)์€ ๋™์ผํ•œ ํ† ํฌ๋‚˜์ด์ €๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ํ›จ์”ฌ ์ž‘์€ ๋ณด์กฐ ๋ชจ๋ธ์„ ํ™œ์šฉํ•˜์—ฌ ๋ช‡ ๊ฐ€์ง€ ํ›„๋ณด ํ† ํฐ์„ ์ƒ์„ฑํ•˜๋Š” ์ƒ์œ„ ๋ชจ๋ธ์˜ ๋””์ฝ”๋”ฉ ์ „๋žต์„ ์ˆ˜์ •ํ•œ ๊ฒƒ์ž…๋‹ˆ๋‹ค. ์ฃผ ๋ชจ๋ธ์€ ๋‹จ์ผ ์ „๋ฐฉ ํ†ต๊ณผ๋กœ ํ›„๋ณด ํ† ํฐ์„ ๊ฒ€์ฆํ•จ์œผ๋กœ์จ ๋””์ฝ”๋”ฉ ๊ณผ์ •์„ ๊ฐ€์†ํ™”ํ•ฉ๋‹ˆ๋‹ค. `do_sample=True`์ผ ๊ฒฝ์šฐ, [์ถ”๋ก  ๋””์ฝ”๋”ฉ ๋…ผ๋ฌธ](https://huggingface.co/papers/2211.17192)์— ์†Œ๊ฐœ๋œ ํ† ํฐ ๊ฒ€์ฆ๊ณผ ์žฌ์ƒ˜ํ”Œ๋ง ๋ฐฉ์‹์ด ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. 
diff --git a/docs/source/ko/internal/generation_utils.md b/docs/source/ko/internal/generation_utils.md index bf567920610c..9bd669e34d2b 100644 --- a/docs/source/ko/internal/generation_utils.md +++ b/docs/source/ko/internal/generation_utils.md @@ -131,9 +131,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -326,10 +323,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md index 084e2a29dc8c..b33ac4be9c92 100644 --- a/docs/source/zh/internal/generation_utils.md +++ b/docs/source/zh/internal/generation_utils.md @@ -133,9 +133,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -316,10 +313,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3349a1698eb8..d0b804beea1c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -411,7 +411,6 @@ "BayesianDetectorConfig", "BayesianDetectorModel", "BeamScorer", - "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", "ConstrainedBeamSearchScorer", "Constraint", @@ -426,7 +425,6 @@ "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", "GenerationMixin", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -656,7 +654,6 @@ from .generation import BayesianDetectorConfig as BayesianDetectorConfig from .generation import BayesianDetectorModel as BayesianDetectorModel from .generation import BeamScorer as BeamScorer - from .generation import BeamSearchScorer as BeamSearchScorer from .generation import ClassifierFreeGuidanceLogitsProcessor as ClassifierFreeGuidanceLogitsProcessor from .generation import CompileConfig as CompileConfig from .generation import ConstrainedBeamSearchScorer as ConstrainedBeamSearchScorer @@ -687,7 +684,6 @@ from .generation import ForcedEOSTokenLogitsProcessor as ForcedEOSTokenLogitsProcessor from .generation import GenerationConfig as GenerationConfig from .generation import GenerationMixin as GenerationMixin - from .generation import HammingDiversityLogitsProcessor as HammingDiversityLogitsProcessor from .generation import InfNanRemoveLogitsProcessor as InfNanRemoveLogitsProcessor from .generation import LogitNormalization as LogitNormalization from .generation import LogitsProcessor as LogitsProcessor diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index a290fcfc733b..d450193dbc2d 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1121,8 +1121,6 @@ def _get_global_generation_defaults() -> dict[str, Any]: "do_sample": False, "early_stopping": False, "num_beams": 1, - "num_beam_groups": 1, - "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, @@ -1141,6 +1139,9 @@ def _get_global_generation_defaults() -> dict[str, Any]: "exponential_decay_length_penalty": None, "suppress_tokens": None, "begin_suppress_tokens": None, + # Deprecated 
arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0 + "num_beam_groups": 1, + "diversity_penalty": 0.0, } def _get_non_default_generation_parameters(self) -> dict[str, Any]: diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 31bba4fc83ae..0cef5308b39b 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -428,10 +428,10 @@ def get_cached_module_file( importlib.invalidate_caches() # Make sure we also have every file with relative for module_needed in modules_needed: - if not (submodule_path / f"{module_needed}.py").exists(): + if not ((submodule_path / module_file).parent / f"{module_needed}.py").exists(): get_cached_module_file( pretrained_model_name_or_path, - f"{module_needed}.py", + f"{Path(module_file).parent / module_needed}.py", cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 64ebfe6fc7c3..4fb3d32213f8 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -44,7 +44,6 @@ _import_structure["beam_search"] = [ "BeamHypotheses", "BeamScorer", - "BeamSearchScorer", "ConstrainedBeamSearchScorer", ] _import_structure["candidate_generator"] = [ @@ -63,7 +62,6 @@ "ExponentialDecayLengthPenalty", "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -209,7 +207,7 @@ pass else: from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint - from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer + from .beam_search import BeamHypotheses, BeamScorer, ConstrainedBeamSearchScorer from .candidate_generator import ( AssistedCandidateGenerator, CandidateGenerator, @@ -227,7 +225,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessor, diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index b6647760b790..08af5755e3d7 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -45,8 +45,6 @@ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. beam_indices (`torch.LongTensor`, *optional*): Beam indices indicating to which beam hypothesis each token correspond. - group_index (`int`, *optional*): - The index of the group of beams. Used with [`~PreTrainedModel.group_beam_search`]. Return: `UserDict`: A dictionary composed of the fields as defined above: @@ -120,302 +118,6 @@ def finalize( raise NotImplementedError("This is an abstract method.") -class BeamSearchScorer(BeamScorer): - r""" - [`BeamScorer`] implementing standard beam search decoding. - - Adapted in part from [Facebook's XLM beam search - code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). - - Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS - implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua) - - Args: - batch_size (`int`): - Batch Size of `input_ids` for which standard beam search decoding is run in parallel. 
- num_beams (`int`): - Number of beams for beam search. - device (`torch.device`): - Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be - allocated. - length_penalty (`float`, *optional*, defaults to 1.0): - Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log - likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while - `length_penalty` < 0.0 encourages shorter sequences. - do_early_stopping (`bool` or `str`, *optional*, defaults to `False`): - Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: - `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an - heuristic is applied and the generation stops when is it very unlikely to find better candidates; - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical - beam search algorithm). - num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): - The number of beam hypotheses that shall be returned upon calling - [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. - max_length (`int`, *optional*): - The maximum length of the sequence to be generated. - """ - - def __init__( - self, - batch_size: int, - num_beams: int, - device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, - max_length: Optional[int] = None, - ): - self.num_beams = num_beams - self.device = device - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups - - self._is_init = False - # self._beam_hyps[i*self.num_beam_groups+j] is the beam_hyps of the j-th group in the i-th mini-batch. - # If group_beam_search is not used, the list consists of `batch_size` beam_hyps. - self._beam_hyps = [ - BeamHypotheses( - num_beams=self.group_size, - length_penalty=self.length_penalty, - early_stopping=self.do_early_stopping, - max_length=max_length, - ) - for _ in range(batch_size * self.num_beam_groups) - ] - # self._done[i*self.num_beam_groups+j] indicates whether the generation of the beam_hyps of the j-th group - # in the i-th mini-batch is complete. - self._done = torch.tensor( - [False for _ in range(batch_size * self.num_beam_groups)], dtype=torch.bool, device=self.device - ) - - if not isinstance(num_beams, int) or num_beams <= 1: - raise ValueError( - f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1," - " one should make use of `greedy_search` instead." 
- ) - - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - - @property - def is_done(self) -> bool: - return self._done.all() - - def process( - self, - input_ids: torch.LongTensor, - next_scores: torch.FloatTensor, - next_tokens: torch.LongTensor, - next_indices: torch.LongTensor, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - group_index: Optional[int] = 0, - decoder_prompt_len: Optional[int] = 0, - ) -> dict[str, torch.Tensor]: - # add up to the length which the next_scores is calculated on (including decoder prompt) - cur_len = input_ids.shape[-1] + 1 - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." - ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) - - device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - for batch_idx in range(batch_size): - batch_group_idx = batch_idx * self.num_beam_groups + group_index - if self._done[batch_group_idx]: - if self.num_beams < len(self._beam_hyps[batch_group_idx]): - raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated") - if eos_token_id is None or pad_token_id is None: - raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined") - # pad the batch - next_beam_scores[batch_idx, :] = 0 - next_beam_tokens[batch_idx, :] = pad_token_id - next_beam_indices[batch_idx, :] = 0 - continue - - # next tokens for this sentence - beam_idx = 0 - for beam_token_rank, (next_token, next_score, next_index) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) - ): - batch_beam_idx = batch_idx * self.group_size + next_index - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size - if is_beam_token_worse_than_top_num_beams: - continue - if beam_indices is not None: - beam_index = beam_indices[batch_beam_idx] - beam_index = beam_index + (batch_beam_idx,) - else: - beam_index = None - - self._beam_hyps[batch_group_idx].add( - input_ids[batch_beam_idx].clone(), - next_score.item(), - beam_indices=beam_index, - generated_len=cur_len - 
decoder_prompt_len, - ) - else: - # add next predicted token since it is not eos_token - next_beam_scores[batch_idx, beam_idx] = next_score - next_beam_tokens[batch_idx, beam_idx] = next_token - next_beam_indices[batch_idx, beam_idx] = batch_beam_idx - beam_idx += 1 - - # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: - break - - if beam_idx < self.group_size: - raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" - f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." - ) - - # Check if we are done so that we can save a pad step if all(done) - self._done[batch_group_idx] = self._done[batch_group_idx] or self._beam_hyps[batch_group_idx].is_done( - next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len - ) - - return UserDict( - { - "next_beam_scores": next_beam_scores.view(-1), - "next_beam_tokens": next_beam_tokens.view(-1), - "next_beam_indices": next_beam_indices.view(-1), - } - ) - - def finalize( - self, - input_ids: torch.LongTensor, - final_beam_scores: torch.FloatTensor, - final_beam_tokens: torch.LongTensor, - final_beam_indices: torch.LongTensor, - max_length: int, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, - ) -> tuple[torch.LongTensor]: - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - # finalize all open beam hypotheses and add to generated hypotheses - for batch_group_idx, beam_hyp in enumerate(self._beam_hyps): - if self._done[batch_group_idx]: - continue - - # all open beam hypotheses are added to the beam hypothesis - # beam hypothesis class automatically keeps the best beams - for index_per_group in range(self.group_size): - batch_beam_idx = batch_group_idx * self.group_size + index_per_group - final_score = final_beam_scores[batch_beam_idx].item() - final_tokens = input_ids[batch_beam_idx] - beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None - generated_len = final_tokens.shape[-1] - decoder_prompt_len - beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len) - - # select the best hypotheses - sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) - best = [] - best_indices = [] - best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) - - # retrieve best hypotheses - for i in range(batch_size): - beam_hyps_in_batch = self._beam_hyps[i * self.num_beam_groups : (i + 1) * self.num_beam_groups] - candidate_beams = [beam for beam_hyp in beam_hyps_in_batch for beam in beam_hyp.beams] - sorted_hyps = sorted(candidate_beams, key=lambda x: x[0]) - for j in range(self.num_beam_hyps_to_keep): - best_hyp_tuple = sorted_hyps.pop() - best_score = best_hyp_tuple[0] - best_hyp = best_hyp_tuple[1] - best_index = best_hyp_tuple[2] - sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) - - # append hyp to lists - best.append(best_hyp) - - # append indices to list - best_indices.append(best_index) - - best_scores[i * self.num_beam_hyps_to_keep + j] = best_score - - # prepare for adding eos - sent_lengths_max = 
sent_lengths.max().item() + 1 - sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max - decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - - if len(best_indices) > 0 and best_indices[0] is not None: - indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - else: - indices = None - - # shorter batches are padded if needed - if sent_lengths.min().item() != sent_lengths.max().item(): - if pad_token_id is None: - raise ValueError("`pad_token_id` has to be defined") - decoded.fill_(pad_token_id) - - if indices is not None: - indices.fill_(-1) - - # fill with hypotheses and eos_token_id if the latter fits in - for i, (hypo, best_idx) in enumerate(zip(best, best_indices)): - decoded[i, : sent_lengths[i]] = hypo - - if indices is not None: - indices[i, : len(best_idx)] = torch.tensor(best_idx) - - if sent_lengths[i] < sent_max_len: - # inserting only the first eos_token_id - decoded[i, sent_lengths[i]] = eos_token_id[0] - - return UserDict( - { - "sequences": decoded, - "sequence_scores": best_scores, - "beam_indices": indices, - } - ) - - class ConstrainedBeamSearchScorer(BeamScorer): r""" [`BeamScorer`] implementing constrained beam search decoding. @@ -446,9 +148,6 @@ class ConstrainedBeamSearchScorer(BeamScorer): num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): The number of beam hypotheses that shall be returned upon calling [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. max_length (`int`, *optional*): The maximum length of the sequence to be generated. """ @@ -462,7 +161,6 @@ def __init__( length_penalty: Optional[float] = 1.0, do_early_stopping: Optional[Union[bool, str]] = False, num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, max_length: Optional[int] = None, ): self.num_beams = num_beams @@ -470,8 +168,6 @@ def __init__( self.length_penalty = length_penalty self.do_early_stopping = do_early_stopping self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups self.constraints = constraints self._is_init = False @@ -492,12 +188,6 @@ def __init__( " one should make use of `greedy_search` instead." ) - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - @property def is_done(self) -> bool: return self._done.all() @@ -564,23 +254,12 @@ def process( # add up to the length which the next_scores is calculated on (including decoder prompt) cur_len = input_ids.shape[-1] + 1 batch_size = len(self._beam_hyps) - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." 
- ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) + next_beam_scores = torch.zeros((batch_size, self.num_beams), dtype=next_scores.dtype, device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), dtype=next_tokens.dtype, device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), dtype=next_indices.dtype, device=device) if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): if isinstance(eos_token_id, int): @@ -604,11 +283,11 @@ def process( for beam_token_rank, (next_token, next_score, next_index) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) ): - batch_beam_idx = batch_idx * self.group_size + next_index + batch_beam_idx = batch_idx * self.num_beams + next_index # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (next_token.item() in eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams if is_beam_token_worse_than_top_num_beams: continue @@ -634,7 +313,7 @@ def process( beam_idx += 1 # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: + if beam_idx == self.num_beams: break new_scores, new_tokens, new_indices = self.step_sentence_constraint( @@ -650,9 +329,9 @@ def process( next_beam_tokens[batch_idx] = new_tokens next_beam_indices[batch_idx] = new_indices - if beam_idx < self.group_size: + if beam_idx < self.num_beams: raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" + f"At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 1edaf19948e8..177fa8064857 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -89,7 +89,6 @@ class GenerationConfig(PushToHubMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -134,9 +133,6 @@ class GenerationConfig(PushToHubMixin): Whether or not to use sampling ; use greedy decoding otherwise. num_beams (`int`, *optional*, defaults to 1): Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. 
- [this paper](https://huggingface.co/papers/1610.02424) for more details. > Parameters that control the cache @@ -190,9 +186,6 @@ class GenerationConfig(PushToHubMixin): probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details. - diversity_penalty (`float`, *optional*, defaults to 0.0): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. repetition_penalty (`float`, *optional*, defaults to 1.0): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://huggingface.co/papers/1909.05858) for more details. @@ -359,7 +352,6 @@ def __init__(self, **kwargs): # Parameters that control the generation strategy used self.do_sample = kwargs.pop("do_sample", False) self.num_beams = kwargs.pop("num_beams", 1) - self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # Parameters that control the cache self.use_cache = kwargs.pop("use_cache", True) @@ -377,7 +369,6 @@ def __init__(self, **kwargs): self.typical_p = kwargs.pop("typical_p", 1.0) self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0) self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0) - self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) @@ -441,6 +432,8 @@ def __init__(self, **kwargs): self.low_memory = kwargs.pop("low_memory", None) self.penalty_alpha = kwargs.pop("penalty_alpha", None) self.dola_layers = kwargs.pop("dola_layers", None) + self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub # interface. @@ -628,14 +621,6 @@ def validate(self, strict=False): minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format( flag_name="early_stopping", flag_value=self.early_stopping ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - minor_issues["num_beam_groups"] = single_beam_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - if self.diversity_penalty is not None and self.diversity_penalty != 0.0: - minor_issues["diversity_penalty"] = single_beam_wrong_parameter_msg.format( - flag_name="diversity_penalty", flag_value=self.diversity_penalty - ) if self.length_penalty is not None and self.length_penalty != 1.0: minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format( flag_name="length_penalty", flag_value=self.length_penalty @@ -658,27 +643,6 @@ def validate(self, strict=False): raise ValueError( constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=self.do_sample) ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - raise ValueError( - constrained_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - ) - # group beam search - elif self.diversity_penalty != 0.0 or self.num_beam_groups != 1: - group_error_prefix = ( - "`diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. 
In " - "this generation mode, " - ) - if self.do_sample is True: - raise ValueError(group_error_prefix + "`do_sample` must be set to `False`") - if self.num_beams % self.num_beam_groups != 0: - raise ValueError(group_error_prefix + "`num_beams` should be divisible by `num_beam_groups`") - if self.diversity_penalty == 0.0: - raise ValueError( - group_error_prefix - + "`diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical." - ) # 2.4. check `num_return_sequences` if self.num_return_sequences != 1: diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 14b4b54aa1c5..abc08ef2eb5c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1441,142 +1441,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to return scores_processed -class HammingDiversityLogitsProcessor(LogitsProcessor): - r""" - [`LogitsProcessor`] that enforces diverse beam search. - - Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam - Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) for more - details. - - Traditional beam search often generates very similar sequences across different beams. - `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by other - beams in the same time step. - - Args: - diversity_penalty (`float`): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. A higher `diversity_penalty` will enforce greater diversity among the beams. Adjusting - this value can help strike a balance between diversity and natural likelihood. - num_beams (`int`): - Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - [this paper](https://huggingface.co/papers/1610.02424) for more details. - - Examples: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - >>> import torch - - >>> # Initialize the model and tokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - >>> # A long text about the solar system - >>> text = ( - ... "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, " - ... "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight " - ... "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System " - ... "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant " - ... "interstellar molecular cloud." - ... ) - >>> inputs = tokenizer("summarize: " + text, return_tensors="pt") - - >>> # Generate diverse summary - >>> outputs_diverse = model.generate( - ... **inputs, - ... num_beam_groups=2, - ... diversity_penalty=10.0, - ... max_length=100, - ... num_beams=4, - ... num_return_sequences=2, - ... ) - >>> summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True) - - >>> # Generate non-diverse summary - >>> outputs_non_diverse = model.generate( - ... **inputs, - ... max_length=100, - ... num_beams=4, - ... 
num_return_sequences=2, - ... ) - >>> summary_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True) - - >>> # With `diversity_penalty`, the resulting beams are much more diverse - >>> print(summary_non_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the Solar System formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.'] - - >>> print(summaries_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets. the rest of the objects are smaller objects, such as the five dwarf planets and small solar system bodies.'] - ``` - """ - - def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): - if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): - raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") - self._diversity_penalty = diversity_penalty - if not isinstance(num_beams, int) or num_beams < 2: - raise ValueError("`num_beams` should be an integer strictly larger than 1.") - self._num_beams = num_beams - if not isinstance(num_beam_groups, int) or num_beam_groups < 2: - raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") - if num_beam_groups > num_beams: - raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") - self._num_sub_beams = num_beams // num_beam_groups - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - current_tokens: torch.LongTensor, - beam_group_idx: int, - ) -> torch.FloatTensor: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`): - Prediction scores of a language modeling head. These can be logits for each vocabulary when not using - beam search or log softmax for each vocabulary token when using beam search - current_tokens (`torch.LongTensor` of shape `(batch_size)`): - Indices of input sequence tokens in the vocabulary, corresponding to the tokens selected by the other - beam groups in the current generation step. - beam_group_idx (`int`): - The index of the beam group currently being processed. - - Return: - `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: - The processed prediction scores. 
- """ - # hamming diversity: penalise using same token in current group which was used in previous groups at - # the same time step - batch_size = current_tokens.shape[0] // self._num_beams - group_start_idx = beam_group_idx * self._num_sub_beams - group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) - group_size = group_end_idx - group_start_idx - vocab_size = scores.shape[-1] - - if group_start_idx == 0: - return scores - - scores_processed = scores.clone() - for batch_idx in range(batch_size): - # predicted tokens of last time step of previous groups - previous_group_tokens = current_tokens[ - batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx - ] - token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) - scores_processed[batch_idx * group_size : (batch_idx + 1) * group_size] -= ( - self._diversity_penalty * token_frequency - ) - - return scores_processed - - class ForcedBOSTokenLogitsProcessor(LogitsProcessor): r""" [`LogitsProcessor`] that enforces the specified token as the first generated token. Used with encoder-decoder diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index e03ad600deb3..68db1406d67d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -54,7 +54,7 @@ logging, ) from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint -from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer +from .beam_search import ConstrainedBeamSearchScorer from .candidate_generator import ( AssistantVocabTranslatorCache, AssistedCandidateGenerator, @@ -82,7 +82,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessorList, @@ -371,7 +370,6 @@ class GenerationMixin(ContinuousMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -1114,14 +1112,6 @@ def _get_logits_processor( if generation_config.sequence_bias is not None: processors.append(SequenceBiasLogitsProcessor(sequence_bias=generation_config.sequence_bias)) - if generation_config.diversity_penalty is not None and generation_config.diversity_penalty > 0.0: - processors.append( - HammingDiversityLogitsProcessor( - diversity_penalty=generation_config.diversity_penalty, - num_beams=generation_config.num_beams, - num_beam_groups=generation_config.num_beam_groups, - ) - ) if ( generation_config.encoder_repetition_penalty is not None and generation_config.encoder_repetition_penalty != 1.0 @@ -1196,7 +1186,7 @@ def _get_logits_processor( processors.append( PrefixConstrainedLogitsProcessor( prefix_allowed_tokens_fn, - generation_config.num_beams // generation_config.num_beam_groups, + generation_config.num_beams, ) ) if generation_config.forced_bos_token_id is not None: @@ -2559,28 +2549,22 @@ def generate( elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: logger.warning_once( - "Group Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. 
" - "To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call." - ) - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - num_beam_groups=generation_config.num_beam_groups, - max_length=generation_config.max_length, + "Group Beam Search was moved to a `custom_generate` repo: https://hf.co/transformers-community/group-beam-search. " + "To prevent loss of backward compatibility, add `custom_generate='transformers-community/group-beam-search'` " + "to your `generate` call before v4.62.0." ) - result = self._group_beam_search( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, + if not trust_remote_code: + raise ValueError( + "Group Beam Search requires `trust_remote_code=True` in your `generate` call, since " + "it loads https://hf.co/transformers-community/group-beam-search." + ) + return GenerationMixin.generate( + self, + inputs, + custom_generate="transformers-community/group-beam-search", generation_config=generation_config, - synced_gpus=synced_gpus, - **model_kwargs, + trust_remote_code=trust_remote_code, + **kwargs, ) elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: @@ -3527,301 +3511,6 @@ def _beam_search( else: return sequences - def _group_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: LogitsProcessorList, - stopping_criteria: StoppingCriteriaList, - generation_config: GenerationConfig, - synced_gpus: bool, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - generation_config ([`~generation.GenerationConfig`]): - The generation configuration to be used as parametrization of the decoding method. - synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed to avoid deadlocking with - `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). - model_kwargs: - Additional model specific kwargs that will be forwarded to the `forward` function of the model. If - model is an encoder-decoder model the kwargs should include `encoder_outputs`. 
- - Return: - [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - """ - # init values - pad_token_id = generation_config._pad_token_tensor - eos_token_id = generation_config._eos_token_tensor - output_attentions = generation_config.output_attentions - output_hidden_states = generation_config.output_hidden_states - output_scores = generation_config.output_scores - output_logits = generation_config.output_logits - return_dict_in_generate = generation_config.return_dict_in_generate - - num_beams = beam_scorer.num_beams - num_beam_groups = beam_scorer.num_beam_groups - num_sub_beams = num_beams // num_beam_groups - batch_size = len(beam_scorer._beam_hyps) // num_beam_groups - device = input_ids.device - - batch_beam_size, cur_len = input_ids.shape - model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs) - - if return_dict_in_generate and output_scores: - beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)] - else: - beam_indices = None - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in - # the same group don't produce same tokens every time. 
- beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) - beam_scores[:, ::num_sub_beams] = 0 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False - - decoder_prompt_len = input_ids.shape[1] # record the prompt length of decoder - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # predicted tokens in cur_len step - current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) - - # indices which will form the beams in the next time step - reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device) - - # do one decoder step on all beams of all sentences in batch - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # prepare variable output controls (note: some models won't accept all output controls) - model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) - model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) - - outputs = self(**model_inputs, return_dict=True) - - # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue - - if output_scores: - processed_score = torch.zeros_like(outputs.logits[:, -1, :]) - if output_logits: - # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration - # (the clone itself is always small) - raw_logit_score = outputs.logits[:, -1, :].to(copy=True, device=input_ids.device) - - for beam_group_idx in range(num_beam_groups): - group_start_idx = beam_group_idx * num_sub_beams - group_end_idx = min(group_start_idx + num_sub_beams, num_beams) - group_size = group_end_idx - group_start_idx - - # indices of beams of current group among all sentences in batch - batch_group_indices = [] - - for batch_idx in range(batch_size): - batch_group_indices.extend( - [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] - ) - group_input_ids = input_ids[batch_group_indices] - - # select outputs of beams of current group only - # No need to clone() the logits here as they will not retain outputs.logits at the end of the loop - # .float() is needed to retain precision for later logits manipulations - next_token_logits = outputs.logits[batch_group_indices, -1, :].to( - dtype=torch.float32, device=input_ids.device - ) - - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * group_size, vocab_size) - vocab_size = next_token_scores.shape[-1] - - next_token_scores_processed = logits_processor( - group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx - ) - next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1) - next_token_scores = next_token_scores.expand_as(next_token_scores_processed) - - if output_scores: - processed_score[batch_group_indices] = next_token_scores_processed - - # reshape for beam search - next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * group_size, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - beam_outputs = beam_scorer.process( - group_input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=process_beam_indices, - group_index=beam_group_idx, - decoder_prompt_len=decoder_prompt_len, - ) - beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - if return_dict_in_generate and output_scores: - beam_indices[beam_group_idx] = tuple( - beam_indices[beam_group_idx][beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices[0])) - ) - - input_ids[batch_group_indices] = group_input_ids[beam_idx] - group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - current_tokens[batch_group_indices] = group_input_ids[:, -1] - - # (beam_idx // group_size) -> batch_idx - # (beam_idx % group_size) -> offset of idx inside the group - reordering_indices[batch_group_indices] = ( - num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") - + group_start_idx - + (beam_idx % group_size) - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (processed_score,) - if output_logits: - raw_logits += (raw_logit_score,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) - - # This is needed to properly delete outputs.logits which may be very large for first iteration - # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration - # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory - # (that way the memory peak does not include outputs.logits) - del outputs - - # NOTE: we need to check if `self._reorder_cache` exists for special models like RAG, RecurrentGemma etc. 
- if model_kwargs.get("past_key_values", None) is not None: - if hasattr(self, "_reorder_cache"): - model_kwargs["past_key_values"] = self._reorder_cache( - model_kwargs["past_key_values"], reordering_indices - ) - else: - model_kwargs["past_key_values"].reorder_cache(reordering_indices) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): - this_peer_finished = True - - final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=final_beam_indices, - decoder_prompt_len=decoder_prompt_len, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return sequence_outputs["sequences"] - def _constrained_beam_search( self, input_ids: torch.LongTensor, diff --git a/src/transformers/models/dia/generation_dia.py b/src/transformers/models/dia/generation_dia.py index 7cac22f0d483..22b607ec2865 100644 --- a/src/transformers/models/dia/generation_dia.py +++ b/src/transformers/models/dia/generation_dia.py @@ -400,7 +400,7 @@ def _main_generate_loop( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) @torch.no_grad() diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 8ee43bd29184..d241268bd3cd 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -1270,7 +1270,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
) # Validate the configuration and model kwargs diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 2ae65710b2a5..0d15572b527b 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1130,7 +1130,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) # Validate the configuration and model kwargs diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c8df024e4e5a..d91a198da816 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1321,7 +1321,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2371,7 +2371,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 28ef14ee15fe..ec544cb88804 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1235,7 +1235,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2236,7 +2236,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
 )

 if generation_config.return_dict_in_generate:
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index c892fb365cbc..ce3fd036aef2 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -177,13 +177,6 @@ def __init__(self, *args, **kwargs):
 requires_backends(self, ["torch"])


-class BeamSearchScorer(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
 class ClassifierFreeGuidanceLogitsProcessor(metaclass=DummyObject):
 _backends = ["torch"]
@@ -282,13 +275,6 @@ def __init__(self, *args, **kwargs):
 requires_backends(self, ["torch"])


-class HammingDiversityLogitsProcessor(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
 class InfNanRemoveLogitsProcessor(metaclass=DummyObject):
 _backends = ["torch"]
diff --git a/tests/generation/test_beam_search.py b/tests/generation/test_beam_search.py
index bd791bbc8fb8..69bb0a40e292 100644
--- a/tests/generation/test_beam_search.py
+++ b/tests/generation/test_beam_search.py
@@ -26,230 +26,12 @@
 from transformers.generation import (
 BeamHypotheses,
- BeamSearchScorer,
 ConstrainedBeamSearchScorer,
 DisjunctiveConstraint,
 PhrasalConstraint,
 )


-class BeamSearchTester:
- def __init__(
- self,
- parent,
- batch_size=3,
- sequence_length=10,
- vocab_size=99,
- pad_token_id=0,
- max_length=20,
- num_beams=4,
- length_penalty=2.0,
- do_early_stopping=True,
- num_beam_hyps_to_keep=2,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.sequence_length = sequence_length
- self.vocab_size = vocab_size
- self.pad_token_id = pad_token_id
- self.max_length = max_length
- self.num_beams = num_beams
- self.length_penalty = length_penalty
- self.do_early_stopping = do_early_stopping
- self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
-
- # cannot be randomly generated
- self.eos_token_id = vocab_size + 1
-
- def prepare_beam_scorer(self, **kwargs):
- return BeamSearchScorer(
- batch_size=kwargs.get("batch_size", self.batch_size),
- num_beams=kwargs.get("num_beams", self.num_beams),
- device=torch_device,
- length_penalty=kwargs.get("length_penalty", self.length_penalty),
- do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping),
- num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep),
- )
-
- def prepare_inputs(self):
- input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size)
- next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device)
- next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device)
- next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True)
- return (input_ids, next_tokens, next_indices, next_scores)
-
- def check_beam_hypotheses(self, input_ids, *args):
- # check that correct number of beam hypotheses is set in beam scorer
- beam_scorer = self.prepare_beam_scorer(do_early_stopping=True)
- beam_hyp = beam_scorer._beam_hyps[0]
-
- self.parent.assertEqual(len(beam_scorer._beam_hyps), self.batch_size)
-
- # check correct type
- self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses))
-
- # check that num_beams is correctly set
- self.parent.assertEqual(beam_hyp.num_beams, self.num_beams)
-
- # check early stopping activated: fill the hypotheses with `num_beams` beams
- for beam_idx in range(self.num_beams):
- beam_hyp.add(input_ids[beam_idx], -10.0)
-
- # if early stopping True -> score does not matter
- self.parent.assertTrue(beam_hyp.is_done(-10.0, 5))
-
- # re-init
- beam_scorer = self.prepare_beam_scorer(do_early_stopping=False)
- beam_hyp = beam_scorer._beam_hyps[0]
-
- # add `num_beams + 1` beams to change `worst_score`
- for beam_idx in range(self.num_beams + 1):
- beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx))
-
- # -10.0 is removed => -9.0 is worst score
- self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty))
-
- # -5.0 is better than worst score => should not be finished
- self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length))
-
- # -20.0 is worse than worst score => should be finished
- self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length))
-
- def check_beam_scorer_update(self, input_ids, next_tokens, next_indices, next_scores):
- # check too many eos tokens
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[0, :] = self.eos_token_id
-
- with self.parent.assertRaises(ValueError):
- beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id)
-
- # check all batches are done
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[:, : self.num_beams] = self.eos_token_id
- beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device)
- beam_indices = tuple(tuple(b) for b in beam_indices)
- beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices
- )
- # beam scorer should be done
- self.parent.assertTrue(beam_scorer.is_done)
-
- # check the regular update: the eos token in second-best position is stored as a
- # finished hypothesis and skipped when selecting the next beams
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[:, 1] = self.eos_token_id
- beam_outputs = beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices
- )
- output_scores = beam_outputs["next_beam_scores"]
- output_tokens = beam_outputs["next_beam_tokens"]
- output_indices = beam_outputs["next_beam_indices"]
-
- def cut_expected_tensor(tensor):
- return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten()
-
- # check all outputs
- # cut out id of eos token and take best `num_beams` outputs
- expected_output_tokens = cut_expected_tensor(tokens)
- expected_output_scores = cut_expected_tensor(next_scores)
-
- # add num_beams * batch_idx
- offset = torch.div(
- torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor"
- )
- expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams
-
- self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist())
- self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist())
- self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3))
-
- # make sure ids of eos token are correctly saved in beam_hyps of beam scorer
- expected_beam_indices = list(range(10))
- for batch_idx in range(self.batch_size):
- correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1]
- self.parent.assertListEqual(
- input_ids[correct_idx].tolist(), beam_scorer._beam_hyps[batch_idx].beams[0][1].tolist()
- )
- self.parent.assertListEqual(
- expected_beam_indices + [correct_idx],
- torch.tensor(beam_scorer._beam_hyps[batch_idx].beams[0][2]).tolist(),
- )
-
- def check_beam_scores_finalize(self, input_ids, next_tokens, next_indices, next_scores):
- # max_length should be only one more than current input_ids to check that eos is correctly appended
- max_length = self.sequence_length + 1
- beam_scorer = self.prepare_beam_scorer(num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False)
-
- # update beams and append to input_ids
- tokens = next_tokens.clone()
- # first batch, first output has to finish with eos token id since scores are correctly sorted
- tokens[0, 0] = self.eos_token_id
- # make sure corresponding score is as good as possible to surely be picked first
- next_scores[0, 0] = 0.0
- beam_outputs = beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id
- )
- output_scores = beam_outputs["next_beam_scores"]
- output_tokens = beam_outputs["next_beam_tokens"]
- output_indices = beam_outputs["next_beam_indices"]
-
- input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1)
-
- # finalize
- beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device)
- beam_indices = tuple(tuple(b) for b in beam_indices)
- sequence_output = beam_scorer.finalize(
- input_ids,
- output_scores,
- output_tokens,
- output_indices,
- pad_token_id=self.pad_token_id,
- eos_token_id=self.eos_token_id,
- max_length=max_length,
- beam_indices=beam_indices,
- )
-
- sequences = sequence_output["sequences"]
- sequence_scores = sequence_output["sequence_scores"]
-
- # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length`
- self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length])
- self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size])
-
- # check sequence_scores
- self.parent.assertFalse((sequence_scores > 0).any().item())
-
- # first batch has to finish with eos_token
- self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id)
-
- # other batches cannot finish with eos token
- self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id)
- self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id)
-
- # now test that if `num_beam_hyps_to_keep` is `num_beams` => all beams are returned
- beam_scorer.num_beam_hyps_to_keep = self.num_beams
- sequence_output = beam_scorer.finalize(
- input_ids,
- output_scores,
- output_tokens,
- output_indices,
- pad_token_id=self.pad_token_id,
- eos_token_id=self.eos_token_id,
- max_length=max_length,
- beam_indices=beam_indices,
- )
- sequences = sequence_output["sequences"]
- sequence_scores = sequence_output["sequence_scores"]
-
- self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length])
- self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size])
-
-
 class ConstrainedBeamSearchTester:
 def __init__(
@@ -540,24 +322,6 @@ def _check_sequence_inside_sequence(self, tensor_1, tensor_2):
 return flag


-@require_torch
-class BeamSearchTest(unittest.TestCase):
- def setUp(self):
- self.beam_search_tester = BeamSearchTester(self)
-
- def test_beam_hypotheses(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_hypotheses(*inputs)
-
- def test_beam_scorer_update(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_scorer_update(*inputs)
-
- def test_beam_scorer_finalize(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_scores_finalize(*inputs)
-
-
 @require_torch
 class ConstrainedBeamSearchTest(unittest.TestCase):
 def setUp(self):
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index 429d61cbd26a..e67e5ba325e0 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -39,7 +39,6 @@
 ForcedBOSTokenLogitsProcessor,
 ForcedEOSTokenLogitsProcessor,
 GenerationMode,
- HammingDiversityLogitsProcessor,
 MinLengthLogitsProcessor,
 MinNewTokensLengthLogitsProcessor,
 MinPLogitsWarper,
@@ -536,31 +535,6 @@ def prefix_allowed_tokens_fn(batch_id, inputs_ids):
 )
 self.assertEqual(prefix_constrained_logits_proc._num_beams, num_beams)

- def test_serialize_generation_diversity_penalty_and_num_bean_groups(self):
- """Tests that GenerationConfig is serialized and HammingDiversityLogitsProcessor is initialized with `diversity_penalty` and `num_beam_groups`"""
- num_beams = 2
- num_beam_groups = 2
- diversity_penalty = 1.0
-
- generation_config = GenerationConfig(
- num_beams=num_beams, diversity_penalty=diversity_penalty, num_beam_groups=num_beam_groups
- )
- with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
- generation_config.save_pretrained(tmp_dir)
- new_config = GenerationConfig.from_pretrained(tmp_dir)
- self.assertEqual(new_config.num_beams, num_beams)
- self.assertEqual(new_config.diversity_penalty, diversity_penalty)
- self.assertEqual(new_config.num_beam_groups, num_beam_groups)
-
- diversity_logits_processor = HammingDiversityLogitsProcessor(
- diversity_penalty=new_config.diversity_penalty,
- num_beams=new_config.num_beams,
- num_beam_groups=new_config.num_beam_groups,
- )
- self.assertEqual(diversity_logits_processor._num_beams, num_beams)
- self.assertEqual(diversity_logits_processor._diversity_penalty, diversity_penalty)
- self.assertEqual(diversity_logits_processor._num_sub_beams, num_beams // num_beam_groups)
-
 def test_serialize_generation_bos_token_id(self):
 """Tests that GenerationConfig is serialized and ForcedBOSTokenLogitsProcessor is initialized with bos_token_id"""
 bos_token_id = 0
diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py
index df68f9c62100..768e216ef534 100644
--- a/tests/generation/test_logits_process.py
+++ b/tests/generation/test_logits_process.py
@@ -36,7 +36,6 @@
 ExponentialDecayLengthPenalty,
 ForcedBOSTokenLogitsProcessor,
 ForcedEOSTokenLogitsProcessor,
- HammingDiversityLogitsProcessor,
 InfNanRemoveLogitsProcessor,
 LogitNormalization,
 LogitsProcessorList,
@@ -796,36 +795,6 @@ def empty_prefix_allowed_tokens_fn(batch_id, inputs_ids):
 # processor should not change logits in-place
 self.assertFalse(torch.all(scores == filtered_scores))

- def test_hamming_diversity(self):
- vocab_size = 4
- num_beams = 2
- num_beam_groups = 2
-
- scores = self._get_uniform_logits(num_beams, vocab_size)
- # batch_idx = 0 -> previous group token at idx = batch_idx * num_beams = 0 -> penalises token 0
- # batch_idx = 1 -> previous group token at idx = batch_idx * num_beams = 2 -> penalises token 1
- current_tokens = torch.tensor([0, 3, 1, 2], device=torch_device, dtype=torch.long)
-
- diversity_logits_processor = HammingDiversityLogitsProcessor(
- diversity_penalty=1.0, num_beams=num_beams, num_beam_groups=num_beam_groups
- )
-
- processed_scores = diversity_logits_processor(None, scores, current_tokens, 1)
-
- self.assertTrue(
- torch.allclose(
- processed_scores[0], torch.tensor([-0.7500, 0.2500, 0.2500,
0.2500], device=torch_device), atol=1e-3 - ) - ) - self.assertTrue( - torch.allclose( - processed_scores[1], torch.tensor([0.2500, -0.7500, 0.2500, 0.2500], device=torch_device), atol=1e-3 - ) - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - def test_forced_bos_token_logits_processor(self): vocab_size = 20 batch_size = 4 diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 449d8122c12b..bfb1ba4111dd 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -209,17 +209,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): } return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - "num_beam_groups": 2, # one beam per group - "diversity_penalty": 2.0, - } - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): beam_kwargs = { "early_stopping": False, @@ -351,36 +340,6 @@ def _beam_sample_generate( return output_generate - def _group_beam_search_generate( - self, - model, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - output_generate = model.generate( - do_sample=False, - max_new_tokens=self.max_new_tokens, - min_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **inputs_dict, - ) - - return output_generate - def _constrained_beam_search_generate( self, model, @@ -747,77 +706,6 @@ def test_generate_without_input_ids(self): ) self.assertIsNotNone(output_ids_generate) - @pytest.mark.generate - def test_group_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - model = model_class(config).to(torch_device).eval() - # check `generate()` and `group_beam_search()` are equal - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - # check `group_beam_search` for higher than 1 `num_return_sequences` - num_return_sequences = 2 - beam_kwargs = self._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, 
inputs_dict = self.prepare_config_and_inputs_for_generate() - if self.has_attentions: - config._attn_implementation = "eager" # can't output attentions otherwise - - model = model_class(config).to(torch_device).eval() - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertTrue( - output_generate.sequences.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1] - ) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_generate_outputs( - output_generate, - model.config, - num_return_sequences=beam_kwargs["num_return_sequences"], - num_beams=beam_kwargs["num_beams"], - ) - @is_flaky() # Some models have position-specific tokens, this test may try to force them in an invalid position @pytest.mark.generate def test_constrained_beam_search_generate(self): @@ -2651,6 +2539,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None): @pytest.mark.generate @require_torch class GenerationIntegrationTests(unittest.TestCase): + # TODO joao, manuel: remove in v4.62.0 @slow def test_diverse_beam_search(self): article = """Justin Timberlake and Jessica Biel, welcome to parenthood. 
@@ -2669,6 +2558,8 @@ def test_diverse_beam_search(self): num_beam_groups=4, diversity_penalty=2.0, remove_invalid_values=True, + trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", ) generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -2828,6 +2719,7 @@ def test_generate_input_values_as_encoder_kwarg(self): self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) self.assertEqual(output_sequences.shape, (2, 5)) + # TODO joao, manuel: remove in v4.62.0 def test_transition_scores_group_beam_search_encoder_decoder(self): articles = [ "Justin Timberlake and Jessica Biel, welcome to parenthood.", @@ -2836,12 +2728,14 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") model = BartForConditionalGeneration.from_pretrained( "hf-internal-testing/tiny-random-bart", + eos_token_id=None, + ) + generation_config = GenerationConfig( max_length=10, num_beams=2, num_beam_groups=2, num_return_sequences=2, diversity_penalty=1.0, - eos_token_id=None, return_dict_in_generate=True, output_scores=True, length_penalty=0.0, @@ -2849,7 +2743,12 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): model = model.to(torch_device) input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids.to(torch_device) - outputs = model.generate(input_ids=input_ids) + outputs = model.generate( + input_ids=input_ids, + generation_config=generation_config, + trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", + ) transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) transition_scores_sum = transition_scores.sum(-1) @@ -4820,6 +4719,16 @@ def test_generate_custom_cache_position(self): [ ("transformers-community/dola", {"dola_layers": "low"}), ("transformers-community/contrastive-search", {"penalty_alpha": 0.6, "top_k": 4}), + ( + "transformers-community/group-beam-search", + { + "do_sample": False, + "num_beams": 2, + "num_beam_groups": 2, + "diversity_penalty": 2.0, + "length_penalty": 2.0, + }, + ), ] ) def test_hub_gen_strategies(self, custom_generate, extra_kwargs): diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index f81685abd091..d77a86a201cb 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -272,16 +272,6 @@ def test_beam_search_generate_dict_outputs_use_cache(self): def test_beam_sample_generate_dict_output(self): pass - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate(self): - pass - - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate_dict_output(self): - pass - @pytest.mark.generate @unittest.skip(reason="CSM does not support constrained beam search.") def test_constrained_beam_search_generate(self): diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py index 5f51649619fe..2f09b65cf8f3 100644 --- a/tests/models/dia/test_modeling_dia.py +++ b/tests/models/dia/test_modeling_dia.py @@ -237,7 +237,6 @@ def skip_non_greedy_generate(self): skippable_tests = [ "test_sample_generate_dict_output", # return sequences > 1 "test_beam", - "test_group_beam", "test_constrained_beam", "test_contrastive", "test_assisted", diff --git 
a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index df8d2d6e508a..bcb7259004ba 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -138,16 +138,6 @@ def test_constrained_beam_search_generate_dict_output(self): def test_generate_without_input_ids(self): pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate(self): - pass - - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") @pytest.mark.generate def test_constrained_beam_search_generate(self): diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 808cef6ddcef..8682c1e75c58 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -401,13 +401,6 @@ def test_greedy_generate_dict_outputs(self): super().test_greedy_generate_dict_outputs() self.has_attentions = old_has_attentions - def test_group_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_group_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - def test_sample_generate_dict_output(self): # This model has a custom attention output shape AND config flags, let's skip those checks old_has_attentions = self.has_attentions diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index bc5b065d918b..a2dcccddb929 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -403,12 +403,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` beam_kwargs = super()._get_constrained_beam_kwargs(num_return_sequences=num_return_sequences) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index d6a34a361354..9ac8e462e8e3 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -434,9 +434,7 @@ def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self): inputs = tokenizer(input_text, return_tensors="pt").to(device) common = { "num_beams": 4, - "num_beam_groups": 2, "num_return_sequences": 4, - "diversity_penalty": 1.0, "max_new_tokens": 20, "early_stopping": True, }
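The test changes above point at the migration path this diff implies: group (diverse) beam search leaves core `generate()` and is loaded from the `transformers-community/group-beam-search` Hub repository via `custom_generate` with `trust_remote_code=True`. Below is a minimal sketch of a post-removal call, mirroring the kwargs added to `test_hub_gen_strategies`; the checkpoint (`openai-community/gpt2`) and the prompt are placeholders, not taken from this diff:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

# placeholder checkpoint; any generative text model should work the same way
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
    num_beams=4,
    num_beam_groups=2,      # beams are split into 2 groups of 2; must divide num_beams
    diversity_penalty=1.0,  # Hamming penalty applied across groups
    # group beam search now lives on the Hub instead of in core transformers
    custom_generate="transformers-community/group-beam-search",
    trust_remote_code=True,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

The generation-strategy kwargs are simply forwarded to the Hub implementation, so existing configs keep working once `custom_generate` and `trust_remote_code` are added, as the updated `test_diverse_beam_search` and `test_transition_scores_group_beam_search_encoder_decoder` tests demonstrate.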