diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 5c7d27192292..63b70899af4d 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -225,28 +225,6 @@ outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=to tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -### Diverse beam search - -[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups. - -Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`). - -```py -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device - -device = infer_device() - -tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") -inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device) - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device) -# explicitly set to 100 because Llama2 generation length is 4096 -outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False) -tokenizer.batch_decode(outputs, skip_special_tokens=True) -'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a' -``` - ## Custom generation methods diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index ecd4e77fc5f7..9deb926b905f 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -108,9 +108,6 @@ generation.
[[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -219,10 +216,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index f58d4a995e80..f7bcd3252493 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -146,7 +146,7 @@ tokenizer = AutoTokenizer.from_pretrained(ckpt) model = AutoModelForCausalLM.from_pretrained(ckpt, dtype=torch.float16, device_map="auto") prompt = ["okay "*1000 + "Fun fact: The most"] inputs = tokenizer(prompt, return_tensors="pt").to(model.device) -beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, } +beams = { "num_beams": 40, "num_return_sequences": 20, "max_new_tokens": 23, "early_stopping": True, } out = resilient_generate(model, **inputs, **beams) responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True) ``` diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md index 856c4856c52f..45eec30c0765 100644 --- a/docs/source/ja/generation_strategies.md +++ b/docs/source/ja/generation_strategies.md @@ -241,43 +241,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### Diverse beam search decoding - -多様なビームサーチデコーディング戦略は、ビームサーチ戦略の拡張であり、選択肢からより多様なビームシーケンスを生成できるようにします。この仕組みの詳細については、[Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) をご参照ください。このアプローチには、`num_beams`、`num_beam_groups`、および `diversity_penalty` という3つの主要なパラメータがあります。多様性ペナルティは、出力がグループごとに異なることを保証し、ビームサーチは各グループ内で使用されます。 - - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... "that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -...
"the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - ### Assisted Decoding ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใฏใ€ไธŠ่จ˜ใฎใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐๆˆฆ็•ฅใ‚’ๅค‰ๆ›ดใ—ใŸใ‚‚ใฎใงใ€ๅŒใ˜ใƒˆใƒผใ‚ฏใƒŠใ‚คใ‚ถใƒผ๏ผˆ็†ๆƒณ็š„ใซใฏใฏใ‚‹ใ‹ใซๅฐใ•ใชใƒขใƒ‡ใƒซ๏ผ‰ใ‚’ไฝฟ็”จใ—ใฆใ€ใ„ใใคใ‹ใฎๅ€™่ฃœใƒˆใƒผใ‚ฏใƒณใ‚’่ฒชๆฌฒใซ็”Ÿๆˆใ™ใ‚‹ใ‚ขใ‚ทใ‚นใ‚ฟใƒณใƒˆใƒขใƒ‡ใƒซใ‚’ไฝฟ็”จใ—ใพใ™ใ€‚ใใฎๅพŒใ€ไธป่ฆใชใƒขใƒ‡ใƒซใฏๅ€™่ฃœใƒˆใƒผใ‚ฏใƒณใ‚’1ใคใฎๅ‰ๅ‘ใใƒ‘ใ‚นใงๆคœ่จผใ—ใ€ใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใƒ—ใƒญใ‚ปใ‚นใ‚’้ซ˜้€ŸๅŒ–ใ—ใพใ™ใ€‚็พๅœจใ€ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใงใฏ่ฒชๆฌฒๆคœ็ดขใจใ‚ตใƒณใƒ—ใƒชใƒณใ‚ฐใฎใฟใŒใ‚ตใƒใƒผใƒˆใ•ใ‚ŒใฆใŠใ‚Šใ€ใƒใƒƒใƒๅ…ฅๅŠ›ใฏใ‚ตใƒใƒผใƒˆใ•ใ‚Œใฆใ„ใพใ›ใ‚“ใ€‚ใ‚ขใ‚ทใ‚นใƒˆใƒ‡ใ‚ณใƒผใƒ‡ใ‚ฃใƒณใ‚ฐใฎ่ฉณ็ดฐใซใคใ„ใฆใฏใ€[ใ“ใฎใƒ–ใƒญใ‚ฐ่จ˜ไบ‹](https://huggingface.co/blog/assisted-generation) ใ‚’ใ”่ฆงใใ ใ•ใ„ใ€‚ diff --git a/docs/source/ja/internal/generation_utils.md b/docs/source/ja/internal/generation_utils.md index 1a5cc1dec079..c01d86f54bc0 100644 --- a/docs/source/ja/internal/generation_utils.md +++ b/docs/source/ja/internal/generation_utils.md @@ -139,9 +139,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -321,10 +318,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/ko/generation_strategies.md b/docs/source/ko/generation_strategies.md index da38e4f418f2..c59eff4111f3 100644 --- a/docs/source/ko/generation_strategies.md +++ b/docs/source/ko/generation_strategies.md @@ -232,44 +232,6 @@ time."\n\nHe added: "I am very proud of the work I have been able to do in the l 'Das Haus ist wunderbar.' ``` -### ๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰ ๋””์ฝ”๋”ฉ(Diverse beam search decoding)[[diverse-beam-search-decoding]] - -๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰(Decoding) ์ „๋žต์€ ์„ ํƒํ•  ์ˆ˜ ์žˆ๋Š” ๋” ๋‹ค์–‘ํ•œ ๋น” ์‹œํ€€์Šค ์ง‘ํ•ฉ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์žˆ๊ฒŒ ํ•ด์ฃผ๋Š” ๋น” ํƒ์ƒ‰ ์ „๋žต์˜ ํ™•์žฅ์ž…๋‹ˆ๋‹ค. ์ด ๋ฐฉ๋ฒ•์€ ์–ด๋–ป๊ฒŒ ์ž‘๋™ํ•˜๋Š”์ง€ ์•Œ์•„๋ณด๋ ค๋ฉด, [๋‹ค์–‘ํ•œ ๋น” ํƒ์ƒ‰: ์‹ ๊ฒฝ ์‹œํ€€์Šค ๋ชจ๋ธ์—์„œ ๋‹ค์–‘ํ•œ ์†”๋ฃจ์…˜ ๋””์ฝ”๋”ฉํ•˜๊ธฐ](https://huggingface.co/papers/1610.02424)๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”. ์ด ์ ‘๊ทผ ๋ฐฉ์‹์€ ์„ธ ๊ฐ€์ง€ ์ฃผ์š” ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ๊ฐ€์ง€๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค: `num_beams`, `num_beam_groups`, ๊ทธ๋ฆฌ๊ณ  `diversity_penalty`. ๋‹ค์–‘์„ฑ ํŒจ๋„ํ‹ฐ๋Š” ๊ทธ๋ฃน ๊ฐ„์— ์ถœ๋ ฅ์ด ์„œ๋กœ ๋‹ค๋ฅด๊ฒŒ ํ•˜๊ธฐ ์œ„ํ•œ ๊ฒƒ์ด๋ฉฐ, ๊ฐ ๊ทธ๋ฃน ๋‚ด์—์„œ ๋น” ํƒ์ƒ‰์ด ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. - -```python ->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - ->>> checkpoint = "google/pegasus-xsum" ->>> prompt = ( -... "The Permaculture Design Principles are a set of universal design principles " -... 
"that can be applied to any location, climate and culture, and they allow us to design " -... "the most efficient and sustainable human habitation and food production systems. " -... "Permaculture is a design system that encompasses a wide variety of disciplines, such " -... "as ecology, landscape design, environmental science and energy conservation, and the " -... "Permaculture design principles are drawn from these various disciplines. Each individual " -... "design principle itself embodies a complete conceptual framework based on sound " -... "scientific principles. When we bring all these separate principles together, we can " -... "create a design system that both looks at whole systems, the parts that these systems " -... "consist of, and how those parts interact with each other to create a complex, dynamic, " -... "living system. Each design principle serves as a tool that allows us to integrate all " -... "the separate parts of a design, referred to as elements, into a functional, synergistic, " -... "whole system, where the elements harmoniously interact and work together in the most " -... "efficient way possible." -... ) - ->>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) ->>> inputs = tokenizer(prompt, return_tensors="pt") - ->>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) - ->>> outputs = model.generate(**inputs, num_beams=5, num_beam_groups=5, max_new_tokens=30, diversity_penalty=1.0) ->>> tokenizer.decode(outputs[0], skip_special_tokens=True) -'The Design Principles are a set of universal design principles that can be applied to any location, climate and -culture, and they allow us to design the' -``` - -์ด ๊ฐ€์ด๋“œ์—์„œ๋Š” ๋‹ค์–‘ํ•œ ๋””์ฝ”๋”ฉ ์ „๋žต์„ ๊ฐ€๋Šฅํ•˜๊ฒŒ ํ•˜๋Š” ์ฃผ์š” ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค. [`generate`] ๋ฉ”์„œ๋“œ์— ๋Œ€ํ•œ ๊ณ ๊ธ‰ ๋งค๊ฐœ๋ณ€์ˆ˜๊ฐ€ ์กด์žฌํ•˜๋ฏ€๋กœ [`generate`] ๋ฉ”์„œ๋“œ์˜ ๋™์ž‘์„ ๋”์šฑ ์„ธ๋ถ€์ ์œผ๋กœ ์ œ์–ดํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๋งค๊ฐœ๋ณ€์ˆ˜์˜ ์ „์ฒด ๋ชฉ๋ก์€ [API ๋ฌธ์„œ](./main_classes/text_generation)๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”. - ### ์ถ”๋ก  ๋””์ฝ”๋”ฉ(Speculative Decoding)[[speculative-decoding]] ์ถ”๋ก  ๋””์ฝ”๋”ฉ(๋ณด์กฐ ๋””์ฝ”๋”ฉ(assisted decoding)์œผ๋กœ๋„ ์•Œ๋ ค์ง)์€ ๋™์ผํ•œ ํ† ํฌ๋‚˜์ด์ €๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ํ›จ์”ฌ ์ž‘์€ ๋ณด์กฐ ๋ชจ๋ธ์„ ํ™œ์šฉํ•˜์—ฌ ๋ช‡ ๊ฐ€์ง€ ํ›„๋ณด ํ† ํฐ์„ ์ƒ์„ฑํ•˜๋Š” ์ƒ์œ„ ๋ชจ๋ธ์˜ ๋””์ฝ”๋”ฉ ์ „๋žต์„ ์ˆ˜์ •ํ•œ ๊ฒƒ์ž…๋‹ˆ๋‹ค. ์ฃผ ๋ชจ๋ธ์€ ๋‹จ์ผ ์ „๋ฐฉ ํ†ต๊ณผ๋กœ ํ›„๋ณด ํ† ํฐ์„ ๊ฒ€์ฆํ•จ์œผ๋กœ์จ ๋””์ฝ”๋”ฉ ๊ณผ์ •์„ ๊ฐ€์†ํ™”ํ•ฉ๋‹ˆ๋‹ค. `do_sample=True`์ผ ๊ฒฝ์šฐ, [์ถ”๋ก  ๋””์ฝ”๋”ฉ ๋…ผ๋ฌธ](https://huggingface.co/papers/2211.17192)์— ์†Œ๊ฐœ๋œ ํ† ํฐ ๊ฒ€์ฆ๊ณผ ์žฌ์ƒ˜ํ”Œ๋ง ๋ฐฉ์‹์ด ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค. 
diff --git a/docs/source/ko/internal/generation_utils.md b/docs/source/ko/internal/generation_utils.md index bf567920610c..9bd669e34d2b 100644 --- a/docs/source/ko/internal/generation_utils.md +++ b/docs/source/ko/internal/generation_utils.md @@ -131,9 +131,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -326,10 +323,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md index 084e2a29dc8c..b33ac4be9c92 100644 --- a/docs/source/zh/internal/generation_utils.md +++ b/docs/source/zh/internal/generation_utils.md @@ -133,9 +133,6 @@ generation_output[:2] [[autodoc]] ForcedEOSTokenLogitsProcessor - __call__ -[[autodoc]] HammingDiversityLogitsProcessor - - __call__ - [[autodoc]] InfNanRemoveLogitsProcessor - __call__ @@ -316,10 +313,6 @@ generation_output[:2] - process - finalize -[[autodoc]] BeamSearchScorer - - process - - finalize - [[autodoc]] ConstrainedBeamSearchScorer - process - finalize diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3349a1698eb8..d0b804beea1c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -411,7 +411,6 @@ "BayesianDetectorConfig", "BayesianDetectorModel", "BeamScorer", - "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", "ConstrainedBeamSearchScorer", "Constraint", @@ -426,7 +425,6 @@ "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", "GenerationMixin", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -656,7 +654,6 @@ from .generation import BayesianDetectorConfig as BayesianDetectorConfig from .generation import BayesianDetectorModel as BayesianDetectorModel from .generation import BeamScorer as BeamScorer - from .generation import BeamSearchScorer as BeamSearchScorer from .generation import ClassifierFreeGuidanceLogitsProcessor as ClassifierFreeGuidanceLogitsProcessor from .generation import CompileConfig as CompileConfig from .generation import ConstrainedBeamSearchScorer as ConstrainedBeamSearchScorer @@ -687,7 +684,6 @@ from .generation import ForcedEOSTokenLogitsProcessor as ForcedEOSTokenLogitsProcessor from .generation import GenerationConfig as GenerationConfig from .generation import GenerationMixin as GenerationMixin - from .generation import HammingDiversityLogitsProcessor as HammingDiversityLogitsProcessor from .generation import InfNanRemoveLogitsProcessor as InfNanRemoveLogitsProcessor from .generation import LogitNormalization as LogitNormalization from .generation import LogitsProcessor as LogitsProcessor diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index a290fcfc733b..d450193dbc2d 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1121,8 +1121,6 @@ def _get_global_generation_defaults() -> dict[str, Any]: "do_sample": False, "early_stopping": False, "num_beams": 1, - "num_beam_groups": 1, - "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, @@ -1141,6 +1139,9 @@ def _get_global_generation_defaults() -> dict[str, Any]: "exponential_decay_length_penalty": None, "suppress_tokens": None, "begin_suppress_tokens": None, + # Deprecated 
arguments (moved to the Hub). TODO joao, manuel: remove in v4.62.0 + "num_beam_groups": 1, + "diversity_penalty": 0.0, } def _get_non_default_generation_parameters(self) -> dict[str, Any]: diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 31bba4fc83ae..0cef5308b39b 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -428,10 +428,10 @@ def get_cached_module_file( importlib.invalidate_caches() # Make sure we also have every file with relative for module_needed in modules_needed: - if not (submodule_path / f"{module_needed}.py").exists(): + if not ((submodule_path / module_file).parent / f"{module_needed}.py").exists(): get_cached_module_file( pretrained_model_name_or_path, - f"{module_needed}.py", + f"{Path(module_file).parent / module_needed}.py", cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 64ebfe6fc7c3..4fb3d32213f8 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -44,7 +44,6 @@ _import_structure["beam_search"] = [ "BeamHypotheses", "BeamScorer", - "BeamSearchScorer", "ConstrainedBeamSearchScorer", ] _import_structure["candidate_generator"] = [ @@ -63,7 +62,6 @@ "ExponentialDecayLengthPenalty", "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", - "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", "LogitNormalization", "LogitsProcessor", @@ -209,7 +207,7 @@ pass else: from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint - from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer + from .beam_search import BeamHypotheses, BeamScorer, ConstrainedBeamSearchScorer from .candidate_generator import ( AssistedCandidateGenerator, CandidateGenerator, @@ -227,7 +225,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessor, diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index b6647760b790..08af5755e3d7 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -45,8 +45,6 @@ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. beam_indices (`torch.LongTensor`, *optional*): Beam indices indicating to which beam hypothesis each token correspond. - group_index (`int`, *optional*): - The index of the group of beams. Used with [`~PreTrainedModel.group_beam_search`]. Return: `UserDict`: A dictionary composed of the fields as defined above: @@ -120,302 +118,6 @@ def finalize( raise NotImplementedError("This is an abstract method.") -class BeamSearchScorer(BeamScorer): - r""" - [`BeamScorer`] implementing standard beam search decoding. - - Adapted in part from [Facebook's XLM beam search - code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529). - - Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS - implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua) - - Args: - batch_size (`int`): - Batch Size of `input_ids` for which standard beam search decoding is run in parallel. 
- num_beams (`int`): - Number of beams for beam search. - device (`torch.device`): - Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be - allocated. - length_penalty (`float`, *optional*, defaults to 1.0): - Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to - the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log - likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while - `length_penalty` < 0.0 encourages shorter sequences. - do_early_stopping (`bool` or `str`, *optional*, defaults to `False`): - Controls the stopping condition for beam-based methods, like beam-search. It accepts the following values: - `True`, where the generation stops as soon as there are `num_beams` complete candidates; `False`, where an - heuristic is applied and the generation stops when is it very unlikely to find better candidates; - `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical - beam search algorithm). - num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): - The number of beam hypotheses that shall be returned upon calling - [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. - max_length (`int`, *optional*): - The maximum length of the sequence to be generated. - """ - - def __init__( - self, - batch_size: int, - num_beams: int, - device: torch.device, - length_penalty: Optional[float] = 1.0, - do_early_stopping: Optional[Union[bool, str]] = False, - num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, - max_length: Optional[int] = None, - ): - self.num_beams = num_beams - self.device = device - self.length_penalty = length_penalty - self.do_early_stopping = do_early_stopping - self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups - - self._is_init = False - # self._beam_hyps[i*self.num_beam_groups+j] is the beam_hyps of the j-th group in the i-th mini-batch. - # If group_beam_search is not used, the list consists of `batch_size` beam_hyps. - self._beam_hyps = [ - BeamHypotheses( - num_beams=self.group_size, - length_penalty=self.length_penalty, - early_stopping=self.do_early_stopping, - max_length=max_length, - ) - for _ in range(batch_size * self.num_beam_groups) - ] - # self._done[i*self.num_beam_groups+j] indicates whether the generation of the beam_hyps of the j-th group - # in the i-th mini-batch is complete. - self._done = torch.tensor( - [False for _ in range(batch_size * self.num_beam_groups)], dtype=torch.bool, device=self.device - ) - - if not isinstance(num_beams, int) or num_beams <= 1: - raise ValueError( - f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1," - " one should make use of `greedy_search` instead." 
- ) - - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - - @property - def is_done(self) -> bool: - return self._done.all() - - def process( - self, - input_ids: torch.LongTensor, - next_scores: torch.FloatTensor, - next_tokens: torch.LongTensor, - next_indices: torch.LongTensor, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - group_index: Optional[int] = 0, - decoder_prompt_len: Optional[int] = 0, - ) -> dict[str, torch.Tensor]: - # add up to the length which the next_scores is calculated on (including decoder prompt) - cur_len = input_ids.shape[-1] + 1 - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." - ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) - - device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - for batch_idx in range(batch_size): - batch_group_idx = batch_idx * self.num_beam_groups + group_index - if self._done[batch_group_idx]: - if self.num_beams < len(self._beam_hyps[batch_group_idx]): - raise ValueError(f"Batch can only be done if at least {self.num_beams} beams have been generated") - if eos_token_id is None or pad_token_id is None: - raise ValueError("Generated beams >= num_beams -> eos_token_id and pad_token have to be defined") - # pad the batch - next_beam_scores[batch_idx, :] = 0 - next_beam_tokens[batch_idx, :] = pad_token_id - next_beam_indices[batch_idx, :] = 0 - continue - - # next tokens for this sentence - beam_idx = 0 - for beam_token_rank, (next_token, next_score, next_index) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) - ): - batch_beam_idx = batch_idx * self.group_size + next_index - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size - if is_beam_token_worse_than_top_num_beams: - continue - if beam_indices is not None: - beam_index = beam_indices[batch_beam_idx] - beam_index = beam_index + (batch_beam_idx,) - else: - beam_index = None - - self._beam_hyps[batch_group_idx].add( - input_ids[batch_beam_idx].clone(), - next_score.item(), - beam_indices=beam_index, - generated_len=cur_len - 
decoder_prompt_len, - ) - else: - # add next predicted token since it is not eos_token - next_beam_scores[batch_idx, beam_idx] = next_score - next_beam_tokens[batch_idx, beam_idx] = next_token - next_beam_indices[batch_idx, beam_idx] = batch_beam_idx - beam_idx += 1 - - # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: - break - - if beam_idx < self.group_size: - raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" - f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." - ) - - # Check if we are done so that we can save a pad step if all(done) - self._done[batch_group_idx] = self._done[batch_group_idx] or self._beam_hyps[batch_group_idx].is_done( - next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len - ) - - return UserDict( - { - "next_beam_scores": next_beam_scores.view(-1), - "next_beam_tokens": next_beam_tokens.view(-1), - "next_beam_indices": next_beam_indices.view(-1), - } - ) - - def finalize( - self, - input_ids: torch.LongTensor, - final_beam_scores: torch.FloatTensor, - final_beam_tokens: torch.LongTensor, - final_beam_indices: torch.LongTensor, - max_length: int, - pad_token_id: Optional[Union[int, torch.Tensor]] = None, - eos_token_id: Optional[Union[int, list[int], torch.Tensor]] = None, - beam_indices: Optional[torch.LongTensor] = None, - decoder_prompt_len: Optional[int] = 0, - ) -> tuple[torch.LongTensor]: - batch_size = len(self._beam_hyps) // self.num_beam_groups - - if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id = torch.tensor(eos_token_id) - - # finalize all open beam hypotheses and add to generated hypotheses - for batch_group_idx, beam_hyp in enumerate(self._beam_hyps): - if self._done[batch_group_idx]: - continue - - # all open beam hypotheses are added to the beam hypothesis - # beam hypothesis class automatically keeps the best beams - for index_per_group in range(self.group_size): - batch_beam_idx = batch_group_idx * self.group_size + index_per_group - final_score = final_beam_scores[batch_beam_idx].item() - final_tokens = input_ids[batch_beam_idx] - beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None - generated_len = final_tokens.shape[-1] - decoder_prompt_len - beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len) - - # select the best hypotheses - sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) - best = [] - best_indices = [] - best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) - - # retrieve best hypotheses - for i in range(batch_size): - beam_hyps_in_batch = self._beam_hyps[i * self.num_beam_groups : (i + 1) * self.num_beam_groups] - candidate_beams = [beam for beam_hyp in beam_hyps_in_batch for beam in beam_hyp.beams] - sorted_hyps = sorted(candidate_beams, key=lambda x: x[0]) - for j in range(self.num_beam_hyps_to_keep): - best_hyp_tuple = sorted_hyps.pop() - best_score = best_hyp_tuple[0] - best_hyp = best_hyp_tuple[1] - best_index = best_hyp_tuple[2] - sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) - - # append hyp to lists - best.append(best_hyp) - - # append indices to list - best_indices.append(best_index) - - best_scores[i * self.num_beam_hyps_to_keep + j] = best_score - - # prepare for adding eos - sent_lengths_max = 
sent_lengths.max().item() + 1 - sent_max_len = min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max - decoded: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - - if len(best_indices) > 0 and best_indices[0] is not None: - indices: torch.LongTensor = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) - else: - indices = None - - # shorter batches are padded if needed - if sent_lengths.min().item() != sent_lengths.max().item(): - if pad_token_id is None: - raise ValueError("`pad_token_id` has to be defined") - decoded.fill_(pad_token_id) - - if indices is not None: - indices.fill_(-1) - - # fill with hypotheses and eos_token_id if the latter fits in - for i, (hypo, best_idx) in enumerate(zip(best, best_indices)): - decoded[i, : sent_lengths[i]] = hypo - - if indices is not None: - indices[i, : len(best_idx)] = torch.tensor(best_idx) - - if sent_lengths[i] < sent_max_len: - # inserting only the first eos_token_id - decoded[i, sent_lengths[i]] = eos_token_id[0] - - return UserDict( - { - "sequences": decoded, - "sequence_scores": best_scores, - "beam_indices": indices, - } - ) - - class ConstrainedBeamSearchScorer(BeamScorer): r""" [`BeamScorer`] implementing constrained beam search decoding. @@ -446,9 +148,6 @@ class ConstrainedBeamSearchScorer(BeamScorer): num_beam_hyps_to_keep (`int`, *optional*, defaults to 1): The number of beam hypotheses that shall be returned upon calling [`~transformers.BeamSearchScorer.finalize`]. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - See [this paper](https://huggingface.co/papers/1610.02424) for more details. max_length (`int`, *optional*): The maximum length of the sequence to be generated. """ @@ -462,7 +161,6 @@ def __init__( length_penalty: Optional[float] = 1.0, do_early_stopping: Optional[Union[bool, str]] = False, num_beam_hyps_to_keep: Optional[int] = 1, - num_beam_groups: Optional[int] = 1, max_length: Optional[int] = None, ): self.num_beams = num_beams @@ -470,8 +168,6 @@ def __init__( self.length_penalty = length_penalty self.do_early_stopping = do_early_stopping self.num_beam_hyps_to_keep = num_beam_hyps_to_keep - self.num_beam_groups = num_beam_groups - self.group_size = self.num_beams // self.num_beam_groups self.constraints = constraints self._is_init = False @@ -492,12 +188,6 @@ def __init__( " one should make use of `greedy_search` instead." ) - if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): - raise ValueError( - "`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` has to be" - f" divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." - ) - @property def is_done(self) -> bool: return self._done.all() @@ -564,23 +254,12 @@ def process( # add up to the length which the next_scores is calculated on (including decoder prompt) cur_len = input_ids.shape[-1] + 1 batch_size = len(self._beam_hyps) - if batch_size != (input_ids.shape[0] // self.group_size): - if self.num_beam_groups > 1: - raise ValueError( - f"A group beam size of {input_ids.shape[0]} is used as the input, but a group beam " - f"size of {self.group_size} is expected by the beam scorer." 
- ) - else: - raise ValueError( - f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of " - f"{self.group_size} is expected by the beam scorer." - ) device = input_ids.device - next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) - next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) - next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) + next_beam_scores = torch.zeros((batch_size, self.num_beams), dtype=next_scores.dtype, device=device) + next_beam_tokens = torch.zeros((batch_size, self.num_beams), dtype=next_tokens.dtype, device=device) + next_beam_indices = torch.zeros((batch_size, self.num_beams), dtype=next_indices.dtype, device=device) if eos_token_id is not None and not isinstance(eos_token_id, torch.Tensor): if isinstance(eos_token_id, int): @@ -604,11 +283,11 @@ def process( for beam_token_rank, (next_token, next_score, next_index) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) ): - batch_beam_idx = batch_idx * self.group_size + next_index + batch_beam_idx = batch_idx * self.num_beams + next_index # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (next_token.item() in eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.num_beams if is_beam_token_worse_than_top_num_beams: continue @@ -634,7 +313,7 @@ def process( beam_idx += 1 # once the beam for next step is full, don't add more tokens to it. - if beam_idx == self.group_size: + if beam_idx == self.num_beams: break new_scores, new_tokens, new_indices = self.step_sentence_constraint( @@ -650,9 +329,9 @@ def process( next_beam_tokens[batch_idx] = new_tokens next_beam_indices[batch_idx] = new_indices - if beam_idx < self.group_size: + if beam_idx < self.num_beams: raise ValueError( - f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" + f"At most {self.num_beams} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id:" f" {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 1edaf19948e8..177fa8064857 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -89,7 +89,6 @@ class GenerationConfig(PushToHubMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -134,9 +133,6 @@ class GenerationConfig(PushToHubMixin): Whether or not to use sampling ; use greedy decoding otherwise. num_beams (`int`, *optional*, defaults to 1): Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. 
- [this paper](https://huggingface.co/papers/1610.02424) for more details. > Parameters that control the cache @@ -190,9 +186,6 @@ class GenerationConfig(PushToHubMixin): probability, scaled by `sqrt(eta_cutoff)`. In the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model. See [Truncation Sampling as Language Model Desmoothing](https://huggingface.co/papers/2210.15191) for more details. - diversity_penalty (`float`, *optional*, defaults to 0.0): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. repetition_penalty (`float`, *optional*, defaults to 1.0): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://huggingface.co/papers/1909.05858) for more details. @@ -359,7 +352,6 @@ def __init__(self, **kwargs): # Parameters that control the generation strategy used self.do_sample = kwargs.pop("do_sample", False) self.num_beams = kwargs.pop("num_beams", 1) - self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # Parameters that control the cache self.use_cache = kwargs.pop("use_cache", True) @@ -377,7 +369,6 @@ def __init__(self, **kwargs): self.typical_p = kwargs.pop("typical_p", 1.0) self.epsilon_cutoff = kwargs.pop("epsilon_cutoff", 0.0) self.eta_cutoff = kwargs.pop("eta_cutoff", 0.0) - self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.encoder_repetition_penalty = kwargs.pop("encoder_repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) @@ -441,6 +432,8 @@ def __init__(self, **kwargs): self.low_memory = kwargs.pop("low_memory", None) self.penalty_alpha = kwargs.pop("penalty_alpha", None) self.dola_layers = kwargs.pop("dola_layers", None) + self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) # The remaining attributes do not parametrize `.generate()`, but are informative and/or used by the hub # interface. @@ -628,14 +621,6 @@ def validate(self, strict=False): minor_issues["early_stopping"] = single_beam_wrong_parameter_msg.format( flag_name="early_stopping", flag_value=self.early_stopping ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - minor_issues["num_beam_groups"] = single_beam_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - if self.diversity_penalty is not None and self.diversity_penalty != 0.0: - minor_issues["diversity_penalty"] = single_beam_wrong_parameter_msg.format( - flag_name="diversity_penalty", flag_value=self.diversity_penalty - ) if self.length_penalty is not None and self.length_penalty != 1.0: minor_issues["length_penalty"] = single_beam_wrong_parameter_msg.format( flag_name="length_penalty", flag_value=self.length_penalty @@ -658,27 +643,6 @@ def validate(self, strict=False): raise ValueError( constrained_wrong_parameter_msg.format(flag_name="do_sample", flag_value=self.do_sample) ) - if self.num_beam_groups is not None and self.num_beam_groups != 1: - raise ValueError( - constrained_wrong_parameter_msg.format( - flag_name="num_beam_groups", flag_value=self.num_beam_groups - ) - ) - # group beam search - elif self.diversity_penalty != 0.0 or self.num_beam_groups != 1: - group_error_prefix = ( - "`diversity_penalty` is not 0.0 or `num_beam_groups` is not 1, triggering group beam search. 
In " - "this generation mode, " - ) - if self.do_sample is True: - raise ValueError(group_error_prefix + "`do_sample` must be set to `False`") - if self.num_beams % self.num_beam_groups != 0: - raise ValueError(group_error_prefix + "`num_beams` should be divisible by `num_beam_groups`") - if self.diversity_penalty == 0.0: - raise ValueError( - group_error_prefix - + "`diversity_penalty` should be greater than `0.0`, otherwise your groups will be identical." - ) # 2.4. check `num_return_sequences` if self.num_return_sequences != 1: diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 14b4b54aa1c5..abc08ef2eb5c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1441,142 +1441,6 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to return scores_processed -class HammingDiversityLogitsProcessor(LogitsProcessor): - r""" - [`LogitsProcessor`] that enforces diverse beam search. - - Note that this logits processor is only effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam - Search: Decoding Diverse Solutions from Neural Sequence Models](https://huggingface.co/papers/1610.02424) for more - details. - - Traditional beam search often generates very similar sequences across different beams. - `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by other - beams in the same time step. - - Args: - diversity_penalty (`float`): - This value is subtracted from a beam's score if it generates a token same as any beam from other group at a - particular time. A higher `diversity_penalty` will enforce greater diversity among the beams. Adjusting - this value can help strike a balance between diversity and natural likelihood. - num_beams (`int`): - Number of beams for beam search. 1 means no beam search. - num_beam_groups (`int`): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - [this paper](https://huggingface.co/papers/1610.02424) for more details. - - Examples: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - >>> import torch - - >>> # Initialize the model and tokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") - - >>> # A long text about the solar system - >>> text = ( - ... "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, " - ... "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight " - ... "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System " - ... "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant " - ... "interstellar molecular cloud." - ... ) - >>> inputs = tokenizer("summarize: " + text, return_tensors="pt") - - >>> # Generate diverse summary - >>> outputs_diverse = model.generate( - ... **inputs, - ... num_beam_groups=2, - ... diversity_penalty=10.0, - ... max_length=100, - ... num_beams=4, - ... num_return_sequences=2, - ... ) - >>> summaries_diverse = tokenizer.batch_decode(outputs_diverse, skip_special_tokens=True) - - >>> # Generate non-diverse summary - >>> outputs_non_diverse = model.generate( - ... **inputs, - ... max_length=100, - ... num_beams=4, - ... 
num_return_sequences=2, - ... ) - >>> summary_non_diverse = tokenizer.batch_decode(outputs_non_diverse, skip_special_tokens=True) - - >>> # With `diversity_penalty`, the resulting beams are much more diverse - >>> print(summary_non_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the Solar System formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.'] - - >>> print(summaries_diverse) - ['the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets.', - 'the solar system formed 4.6 billion years ago from the collapse of a giant interstellar molecular cloud. of the objects that orbit the Sun directly, the largest are the eight planets. the rest of the objects are smaller objects, such as the five dwarf planets and small solar system bodies.'] - ``` - """ - - def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): - if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): - raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") - self._diversity_penalty = diversity_penalty - if not isinstance(num_beams, int) or num_beams < 2: - raise ValueError("`num_beams` should be an integer strictly larger than 1.") - self._num_beams = num_beams - if not isinstance(num_beam_groups, int) or num_beam_groups < 2: - raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") - if num_beam_groups > num_beams: - raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") - self._num_sub_beams = num_beams // num_beam_groups - - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - current_tokens: torch.LongTensor, - beam_group_idx: int, - ) -> torch.FloatTensor: - r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`): - Prediction scores of a language modeling head. These can be logits for each vocabulary when not using - beam search or log softmax for each vocabulary token when using beam search - current_tokens (`torch.LongTensor` of shape `(batch_size)`): - Indices of input sequence tokens in the vocabulary, corresponding to the tokens selected by the other - beam groups in the current generation step. - beam_group_idx (`int`): - The index of the beam group currently being processed. - - Return: - `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: - The processed prediction scores. 
- """ - # hamming diversity: penalise using same token in current group which was used in previous groups at - # the same time step - batch_size = current_tokens.shape[0] // self._num_beams - group_start_idx = beam_group_idx * self._num_sub_beams - group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) - group_size = group_end_idx - group_start_idx - vocab_size = scores.shape[-1] - - if group_start_idx == 0: - return scores - - scores_processed = scores.clone() - for batch_idx in range(batch_size): - # predicted tokens of last time step of previous groups - previous_group_tokens = current_tokens[ - batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx - ] - token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) - scores_processed[batch_idx * group_size : (batch_idx + 1) * group_size] -= ( - self._diversity_penalty * token_frequency - ) - - return scores_processed - - class ForcedBOSTokenLogitsProcessor(LogitsProcessor): r""" [`LogitsProcessor`] that enforces the specified token as the first generated token. Used with encoder-decoder diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index e03ad600deb3..68db1406d67d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -54,7 +54,7 @@ logging, ) from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint -from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer +from .beam_search import ConstrainedBeamSearchScorer from .candidate_generator import ( AssistantVocabTranslatorCache, AssistedCandidateGenerator, @@ -82,7 +82,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, LogitNormalization, LogitsProcessorList, @@ -371,7 +370,6 @@ class GenerationMixin(ContinuousMixin): - *multinomial sampling* if `num_beams=1` and `do_sample=True` - *beam-search decoding* if `num_beams>1` and `do_sample=False` - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True` - - *diverse beam-search decoding* if `num_beams>1` and `num_beam_groups>1` - *constrained beam-search decoding* if `constraints!=None` or `force_words_ids!=None` - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()` @@ -1114,14 +1112,6 @@ def _get_logits_processor( if generation_config.sequence_bias is not None: processors.append(SequenceBiasLogitsProcessor(sequence_bias=generation_config.sequence_bias)) - if generation_config.diversity_penalty is not None and generation_config.diversity_penalty > 0.0: - processors.append( - HammingDiversityLogitsProcessor( - diversity_penalty=generation_config.diversity_penalty, - num_beams=generation_config.num_beams, - num_beam_groups=generation_config.num_beam_groups, - ) - ) if ( generation_config.encoder_repetition_penalty is not None and generation_config.encoder_repetition_penalty != 1.0 @@ -1196,7 +1186,7 @@ def _get_logits_processor( processors.append( PrefixConstrainedLogitsProcessor( prefix_allowed_tokens_fn, - generation_config.num_beams // generation_config.num_beam_groups, + generation_config.num_beams, ) ) if generation_config.forced_bos_token_id is not None: @@ -2559,28 +2549,22 @@ def generate( elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: logger.warning_once( - "Group Beam Search is scheduled to be moved to a `custom_generate` repository in v4.55.0. 
" - "To prevent loss of backward compatibility, add `trust_remote_code=True` to your `generate` call." - ) - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=generation_config.num_beams, - device=inputs_tensor.device, - length_penalty=generation_config.length_penalty, - do_early_stopping=generation_config.early_stopping, - num_beam_hyps_to_keep=generation_config.num_return_sequences, - num_beam_groups=generation_config.num_beam_groups, - max_length=generation_config.max_length, + "Group Beam Search was moved to a `custom_generate` repo: https://hf.co/transformers-community/group-beam-search. " + "To prevent loss of backward compatibility, add `custom_generate='transformers-community/group-beam-search'` " + "to your `generate` call before v4.62.0." ) - result = self._group_beam_search( - input_ids, - beam_scorer, - logits_processor=prepared_logits_processor, - stopping_criteria=prepared_stopping_criteria, + if not trust_remote_code: + raise ValueError( + "Group Beam Search requires `trust_remote_code=True` in your `generate` call, since " + "it loads https://hf.co/transformers-community/group-beam-search." + ) + return GenerationMixin.generate( + self, + inputs, + custom_generate="transformers-community/group-beam-search", generation_config=generation_config, - synced_gpus=synced_gpus, - **model_kwargs, + trust_remote_code=trust_remote_code, + **kwargs, ) elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: @@ -3527,301 +3511,6 @@ def _beam_search( else: return sequences - def _group_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: LogitsProcessorList, - stopping_criteria: StoppingCriteriaList, - generation_config: GenerationConfig, - synced_gpus: bool, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - generation_config ([`~generation.GenerationConfig`]): - The generation configuration to be used as parametrization of the decoding method. - synced_gpus (`bool`): - Whether to continue running the while loop until max_length (needed to avoid deadlocking with - `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3). - model_kwargs: - Additional model specific kwargs that will be forwarded to the `forward` function of the model. If - model is an encoder-decoder model the kwargs should include `encoder_outputs`. 
- - Return: - [`~generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - """ - # init values - pad_token_id = generation_config._pad_token_tensor - eos_token_id = generation_config._eos_token_tensor - output_attentions = generation_config.output_attentions - output_hidden_states = generation_config.output_hidden_states - output_scores = generation_config.output_scores - output_logits = generation_config.output_logits - return_dict_in_generate = generation_config.return_dict_in_generate - - num_beams = beam_scorer.num_beams - num_beam_groups = beam_scorer.num_beam_groups - num_sub_beams = num_beams // num_beam_groups - batch_size = len(beam_scorer._beam_hyps) // num_beam_groups - device = input_ids.device - - batch_beam_size, cur_len = input_ids.shape - model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs) - - if return_dict_in_generate and output_scores: - beam_indices = [tuple(() for _ in range(num_sub_beams * batch_size)) for _ in range(num_beam_groups)] - else: - beam_indices = None - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - raw_logits = () if (return_dict_in_generate and output_logits) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in - # the same group don't produce same tokens every time. 
- beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) - beam_scores[:, ::num_sub_beams] = 0 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False - - decoder_prompt_len = input_ids.shape[1] # record the prompt length of decoder - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): - # predicted tokens in cur_len step - current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) - - # indices which will form the beams in the next time step - reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device) - - # do one decoder step on all beams of all sentences in batch - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # prepare variable output controls (note: some models won't accept all output controls) - model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) - model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) - - outputs = self(**model_inputs, return_dict=True) - - # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping - model_kwargs = self._update_model_kwargs_for_generation( - outputs, - model_kwargs, - is_encoder_decoder=self.config.is_encoder_decoder, - ) - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue - - if output_scores: - processed_score = torch.zeros_like(outputs.logits[:, -1, :]) - if output_logits: - # Copy is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration - # (the clone itself is always small) - raw_logit_score = outputs.logits[:, -1, :].to(copy=True, device=input_ids.device) - - for beam_group_idx in range(num_beam_groups): - group_start_idx = beam_group_idx * num_sub_beams - group_end_idx = min(group_start_idx + num_sub_beams, num_beams) - group_size = group_end_idx - group_start_idx - - # indices of beams of current group among all sentences in batch - batch_group_indices = [] - - for batch_idx in range(batch_size): - batch_group_indices.extend( - [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] - ) - group_input_ids = input_ids[batch_group_indices] - - # select outputs of beams of current group only - # No need to clone() the logits here as they will not retain outputs.logits at the end of the loop - # .float() is needed to retain precision for later logits manipulations - next_token_logits = outputs.logits[batch_group_indices, -1, :].to( - dtype=torch.float32, device=input_ids.device - ) - - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * group_size, vocab_size) - vocab_size = next_token_scores.shape[-1] - - next_token_scores_processed = logits_processor( - group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx - ) - next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1) - next_token_scores = next_token_scores.expand_as(next_token_scores_processed) - - if output_scores: - processed_score[batch_group_indices] = next_token_scores_processed - - # reshape for beam search - next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * group_size, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - beam_outputs = beam_scorer.process( - group_input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=process_beam_indices, - group_index=beam_group_idx, - decoder_prompt_len=decoder_prompt_len, - ) - beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - if return_dict_in_generate and output_scores: - beam_indices[beam_group_idx] = tuple( - beam_indices[beam_group_idx][beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices[0])) - ) - - input_ids[batch_group_indices] = group_input_ids[beam_idx] - group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - current_tokens[batch_group_indices] = group_input_ids[:, -1] - - # (beam_idx // group_size) -> batch_idx - # (beam_idx % group_size) -> offset of idx inside the group - reordering_indices[batch_group_indices] = ( - num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") - + group_start_idx - + (beam_idx % group_size) - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (processed_score,) - if output_logits: - raw_logits += (raw_logit_score,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) - - # This is needed to properly delete outputs.logits which may be very large for first iteration - # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration - # IMPORTANT: Note that this should appear BEFORE the call to _reorder_cache() to save the maximum memory - # (that way the memory peak does not include outputs.logits) - del outputs - - # NOTE: we need to check if `self._reorder_cache` exists for special models like RAG, RecurrentGemma etc. 
- if model_kwargs.get("past_key_values", None) is not None: - if hasattr(self, "_reorder_cache"): - model_kwargs["past_key_values"] = self._reorder_cache( - model_kwargs["past_key_values"], reordering_indices - ) - else: - model_kwargs["past_key_values"].reorder_cache(reordering_indices) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): - this_peer_finished = True - - final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=final_beam_indices, - decoder_prompt_len=decoder_prompt_len, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return GenerateBeamEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) - else: - return sequence_outputs["sequences"] - def _constrained_beam_search( self, input_ids: torch.LongTensor, diff --git a/src/transformers/models/dia/generation_dia.py b/src/transformers/models/dia/generation_dia.py index 7cac22f0d483..22b607ec2865 100644 --- a/src/transformers/models/dia/generation_dia.py +++ b/src/transformers/models/dia/generation_dia.py @@ -400,7 +400,7 @@ def _main_generate_loop( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) @torch.no_grad() diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py index 8ee43bd29184..d241268bd3cd 100644 --- a/src/transformers/models/janus/modeling_janus.py +++ b/src/transformers/models/janus/modeling_janus.py @@ -1270,7 +1270,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
) # Validate the configuration and model kwargs diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index 2ae65710b2a5..0d15572b527b 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -1130,7 +1130,7 @@ def generate( if generation_config.get_generation_mode() not in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): raise ValueError( "Got incompatible mode for Image Generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) # Validate the configuration and model kwargs diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c8df024e4e5a..d91a198da816 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1321,7 +1321,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2371,7 +2371,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 28ef14ee15fe..ec544cb88804 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1235,7 +1235,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." ) if generation_config.return_dict_in_generate: @@ -2236,7 +2236,7 @@ def generate( else: raise ValueError( "Got incompatible mode for generation, should be one of greedy or sampling. " - "Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`." + "Ensure that beam search is de-activated by setting `num_beams=1`." 
 )

 if generation_config.return_dict_in_generate:
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index c892fb365cbc..ce3fd036aef2 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -177,13 +177,6 @@ def __init__(self, *args, **kwargs):
 requires_backends(self, ["torch"])


-class BeamSearchScorer(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
 class ClassifierFreeGuidanceLogitsProcessor(metaclass=DummyObject):
 _backends = ["torch"]
@@ -282,13 +275,6 @@ def __init__(self, *args, **kwargs):
 requires_backends(self, ["torch"])


-class HammingDiversityLogitsProcessor(metaclass=DummyObject):
- _backends = ["torch"]
-
- def __init__(self, *args, **kwargs):
- requires_backends(self, ["torch"])
-
-
 class InfNanRemoveLogitsProcessor(metaclass=DummyObject):
 _backends = ["torch"]
diff --git a/tests/generation/test_beam_search.py b/tests/generation/test_beam_search.py
index bd791bbc8fb8..69bb0a40e292 100644
--- a/tests/generation/test_beam_search.py
+++ b/tests/generation/test_beam_search.py
@@ -26,230 +26,12 @@
 from transformers.generation import (
 BeamHypotheses,
- BeamSearchScorer,
 ConstrainedBeamSearchScorer,
 DisjunctiveConstraint,
 PhrasalConstraint,
 )


-class BeamSearchTester:
- def __init__(
- self,
- parent,
- batch_size=3,
- sequence_length=10,
- vocab_size=99,
- pad_token_id=0,
- max_length=20,
- num_beams=4,
- length_penalty=2.0,
- do_early_stopping=True,
- num_beam_hyps_to_keep=2,
- ):
- self.parent = parent
- self.batch_size = batch_size
- self.sequence_length = sequence_length
- self.vocab_size = vocab_size
- self.pad_token_id = pad_token_id
- self.max_length = max_length
- self.num_beams = num_beams
- self.length_penalty = length_penalty
- self.do_early_stopping = do_early_stopping
- self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
-
- # cannot be randomly generated
- self.eos_token_id = vocab_size + 1
-
- def prepare_beam_scorer(self, **kwargs):
- return BeamSearchScorer(
- batch_size=kwargs.get("batch_size", self.batch_size),
- num_beams=kwargs.get("num_beams", self.num_beams),
- device=torch_device,
- length_penalty=kwargs.get("length_penalty", self.length_penalty),
- do_early_stopping=kwargs.get("do_early_stopping", self.do_early_stopping),
- num_beam_hyps_to_keep=kwargs.get("num_beam_hyps_to_keep", self.num_beam_hyps_to_keep),
- )
-
- def prepare_inputs(self):
- input_ids = ids_tensor((self.batch_size * self.num_beams, self.sequence_length), self.vocab_size)
- next_tokens = ids_tensor((self.batch_size, 2 * self.num_beams), self.vocab_size).to(torch_device)
- next_indices = ids_tensor((self.batch_size, 2 * self.num_beams), self.num_beams).to(torch_device)
- next_scores, _ = (-floats_tensor((self.batch_size, 2 * self.num_beams)).to(torch_device)).sort(descending=True)
- return (input_ids, next_tokens, next_indices, next_scores)
-
- def check_beam_hypotheses(self, input_ids, *args):
- # check that correct number of beam hypotheses is set in beam scorer
- beam_scorer = self.prepare_beam_scorer(do_early_stopping=True)
- beam_hyp = beam_scorer._beam_hyps[0]
-
- self.parent.assertEqual(len(beam_scorer._beam_hyps), self.batch_size)
-
- # check correct type
- self.parent.assertTrue(isinstance(beam_hyp, BeamHypotheses))
-
- # check that num_beams is correctly set
- self.parent.assertEqual(beam_hyp.num_beams, self.num_beams)
-
- # check early stopping activated: fill the hypotheses with `num_beams` beams
- for beam_idx in range(self.num_beams):
- beam_hyp.add(input_ids[beam_idx], -10.0)
-
- # if early stopping True -> score does not matter
- self.parent.assertTrue(beam_hyp.is_done(-10.0, 5))
-
- # re-init
- beam_scorer = self.prepare_beam_scorer(do_early_stopping=False)
- beam_hyp = beam_scorer._beam_hyps[0]
-
- # add `num_beams + 1` beams to change `worst_score`
- for beam_idx in range(self.num_beams + 1):
- beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx))
-
- # -10.0 is removed => -9.0 is worst score
- self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty))
-
- # -5.0 is better than worst score => should not be finished
- self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length))
-
- # -20.0 is worse than worst score => should be finished
- self.parent.assertTrue(beam_hyp.is_done(-20.0, self.sequence_length))
-
- def check_beam_scorer_update(self, input_ids, next_tokens, next_indices, next_scores):
- # check too many eos tokens
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[0, :] = self.eos_token_id
-
- with self.parent.assertRaises(ValueError):
- beam_scorer.process(input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id)
-
- # check all batches are done
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[:, : self.num_beams] = self.eos_token_id
- beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device)
- beam_indices = tuple(tuple(b) for b in beam_indices)
- beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices
- )
- # beam scorer should be done
- self.parent.assertTrue(beam_scorer.is_done)
-
- # check the regular update: the eos token in second-best position is stored as a
- # finished hypothesis and skipped when selecting the next beams
- beam_scorer = self.prepare_beam_scorer()
-
- tokens = next_tokens.clone()
- tokens[:, 1] = self.eos_token_id
- beam_outputs = beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id, beam_indices=beam_indices
- )
- output_scores = beam_outputs["next_beam_scores"]
- output_tokens = beam_outputs["next_beam_tokens"]
- output_indices = beam_outputs["next_beam_indices"]
-
- def cut_expected_tensor(tensor):
- return torch.cat([tensor[:, :1], tensor[:, 2 : self.num_beams + 1]], dim=1).flatten()
-
- # check all outputs
- # cut out id of eos token and take best `num_beams` outputs
- expected_output_tokens = cut_expected_tensor(tokens)
- expected_output_scores = cut_expected_tensor(next_scores)
-
- # add num_beams * batch_idx
- offset = torch.div(
- torch.arange(self.num_beams * self.batch_size, device=torch_device), self.num_beams, rounding_mode="floor"
- )
- expected_output_indices = cut_expected_tensor(next_indices) + offset * self.num_beams
-
- self.parent.assertListEqual(expected_output_tokens.tolist(), output_tokens.tolist())
- self.parent.assertListEqual(expected_output_indices.tolist(), output_indices.tolist())
- self.parent.assertTrue(torch.allclose(expected_output_scores, output_scores, atol=1e-3))
-
- # make sure ids of eos token are correctly saved in beam_hyps of beam scorer
- expected_beam_indices = list(range(10))
- for batch_idx in range(self.batch_size):
- correct_idx = batch_idx * self.num_beams + next_indices[batch_idx, 1]
- self.parent.assertListEqual(
- input_ids[correct_idx].tolist(), beam_scorer._beam_hyps[batch_idx].beams[0][1].tolist()
- )
- self.parent.assertListEqual(
- expected_beam_indices + [correct_idx],
- torch.tensor(beam_scorer._beam_hyps[batch_idx].beams[0][2]).tolist(),
- )
-
- def check_beam_scores_finalize(self, input_ids, next_tokens, next_indices, next_scores):
- # max_length should be only one more than current input_ids to check that eos is correctly appended
- max_length = self.sequence_length + 1
- beam_scorer = self.prepare_beam_scorer(num_beam_hyps_to_keep=1, length_penalty=1.0, do_early_stopping=False)
-
- # update beams and append to input_ids
- tokens = next_tokens.clone()
- # first batch, first output has to finish with eos token id since scores are correctly sorted
- tokens[0, 0] = self.eos_token_id
- # make sure corresponding score is as good as possible to surely be picked first
- next_scores[0, 0] = 0.0
- beam_outputs = beam_scorer.process(
- input_ids, next_scores, tokens, next_indices, eos_token_id=self.eos_token_id
- )
- output_scores = beam_outputs["next_beam_scores"]
- output_tokens = beam_outputs["next_beam_tokens"]
- output_indices = beam_outputs["next_beam_indices"]
-
- input_ids = torch.cat([input_ids[output_indices, :], output_tokens.unsqueeze(-1)], dim=-1)
-
- # finalize
- beam_indices = torch.zeros_like(input_ids) + torch.arange(input_ids.shape[-1], device=input_ids.device)
- beam_indices = tuple(tuple(b) for b in beam_indices)
- sequence_output = beam_scorer.finalize(
- input_ids,
- output_scores,
- output_tokens,
- output_indices,
- pad_token_id=self.pad_token_id,
- eos_token_id=self.eos_token_id,
- max_length=max_length,
- beam_indices=beam_indices,
- )
-
- sequences = sequence_output["sequences"]
- sequence_scores = sequence_output["sequence_scores"]
-
- # since `num_beam_hyps_to_keep` = 1 => only return `batch_size` x `max_length`
- self.parent.assertListEqual(list(sequences.shape), [self.batch_size, max_length])
- self.parent.assertListEqual(list(sequence_scores.shape), [self.batch_size])
-
- # check sequence_scores
- self.parent.assertFalse((sequence_scores > 0).any().item())
-
- # first batch has to finish with eos_token
- self.parent.assertEqual(sequences[0, -1].item(), self.eos_token_id)
-
- # other batches cannot finish with eos token
- self.parent.assertNotEqual(sequences[1, -1].item(), self.eos_token_id)
- self.parent.assertNotEqual(sequences[2, -1].item(), self.eos_token_id)
-
- # now test that if `num_beam_hyps_to_keep` is `num_beams` => all beams are returned
- beam_scorer.num_beam_hyps_to_keep = self.num_beams
- sequence_output = beam_scorer.finalize(
- input_ids,
- output_scores,
- output_tokens,
- output_indices,
- pad_token_id=self.pad_token_id,
- eos_token_id=self.eos_token_id,
- max_length=max_length,
- beam_indices=beam_indices,
- )
- sequences = sequence_output["sequences"]
- sequence_scores = sequence_output["sequence_scores"]
-
- self.parent.assertListEqual(list(sequences.shape), [self.num_beams * self.batch_size, max_length])
- self.parent.assertListEqual(list(sequence_scores.shape), [self.num_beams * self.batch_size])
-
-
 class ConstrainedBeamSearchTester:
 def __init__(
@@ -540,24 +322,6 @@ def _check_sequence_inside_sequence(self, tensor_1, tensor_2):
 return flag


-@require_torch
-class BeamSearchTest(unittest.TestCase):
- def setUp(self):
- self.beam_search_tester = BeamSearchTester(self)
-
- def test_beam_hypotheses(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_hypotheses(*inputs)
-
- def test_beam_scorer_update(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_scorer_update(*inputs)
-
- def test_beam_scorer_finalize(self):
- inputs = self.beam_search_tester.prepare_inputs()
- self.beam_search_tester.check_beam_scores_finalize(*inputs)
-
-
 @require_torch
 class ConstrainedBeamSearchTest(unittest.TestCase):
 def setUp(self):
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index 429d61cbd26a..e67e5ba325e0 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -39,7 +39,6 @@
 ForcedBOSTokenLogitsProcessor,
 ForcedEOSTokenLogitsProcessor,
 GenerationMode,
- HammingDiversityLogitsProcessor,
 MinLengthLogitsProcessor,
 MinNewTokensLengthLogitsProcessor,
 MinPLogitsWarper,
@@ -536,31 +535,6 @@ def prefix_allowed_tokens_fn(batch_id, inputs_ids):
 )
 self.assertEqual(prefix_constrained_logits_proc._num_beams, num_beams)

- def test_serialize_generation_diversity_penalty_and_num_bean_groups(self):
- """Tests that GenerationConfig is serialized and HammingDiversityLogitsProcessor is initialized with `diversity_penalty` and `num_beam_groups`"""
- num_beams = 2
- num_beam_groups = 2
- diversity_penalty = 1.0
-
- generation_config = GenerationConfig(
- num_beams=num_beams, diversity_penalty=diversity_penalty, num_beam_groups=num_beam_groups
- )
- with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
- generation_config.save_pretrained(tmp_dir)
- new_config = GenerationConfig.from_pretrained(tmp_dir)
- self.assertEqual(new_config.num_beams, num_beams)
- self.assertEqual(new_config.diversity_penalty, diversity_penalty)
- self.assertEqual(new_config.num_beam_groups, num_beam_groups)
-
- diversity_logits_processor = HammingDiversityLogitsProcessor(
- diversity_penalty=new_config.diversity_penalty,
- num_beams=new_config.num_beams,
- num_beam_groups=new_config.num_beam_groups,
- )
- self.assertEqual(diversity_logits_processor._num_beams, num_beams)
- self.assertEqual(diversity_logits_processor._diversity_penalty, diversity_penalty)
- self.assertEqual(diversity_logits_processor._num_sub_beams, num_beams // num_beam_groups)
-
 def test_serialize_generation_bos_token_id(self):
 """Tests that GenerationConfig is serialized and ForcedBOSTokenLogitsProcessor is initialized with bos_token_id"""
 bos_token_id = 0
diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py
index df68f9c62100..768e216ef534 100644
--- a/tests/generation/test_logits_process.py
+++ b/tests/generation/test_logits_process.py
@@ -36,7 +36,6 @@
 ExponentialDecayLengthPenalty,
 ForcedBOSTokenLogitsProcessor,
 ForcedEOSTokenLogitsProcessor,
- HammingDiversityLogitsProcessor,
 InfNanRemoveLogitsProcessor,
 LogitNormalization,
 LogitsProcessorList,
@@ -796,36 +795,6 @@ def empty_prefix_allowed_tokens_fn(batch_id, inputs_ids):
 # processor should not change logits in-place
 self.assertFalse(torch.all(scores == filtered_scores))

- def test_hamming_diversity(self):
- vocab_size = 4
- num_beams = 2
- num_beam_groups = 2
-
- scores = self._get_uniform_logits(num_beams, vocab_size)
- # batch_idx = 0 -> previous group token at idx = batch_idx * num_beams = 0 -> penalises token 0
- # batch_idx = 1 -> previous group token at idx = batch_idx * num_beams = 2 -> penalises token 1
- current_tokens = torch.tensor([0, 3, 1, 2], device=torch_device, dtype=torch.long)
-
- diversity_logits_processor = HammingDiversityLogitsProcessor(
- diversity_penalty=1.0, num_beams=num_beams, num_beam_groups=num_beam_groups
- )
-
- processed_scores = diversity_logits_processor(None, scores, current_tokens, 1)
-
- self.assertTrue(
- torch.allclose(
- processed_scores[0], torch.tensor([-0.7500, 0.2500, 0.2500,
0.2500], device=torch_device), atol=1e-3 - ) - ) - self.assertTrue( - torch.allclose( - processed_scores[1], torch.tensor([0.2500, -0.7500, 0.2500, 0.2500], device=torch_device), atol=1e-3 - ) - ) - - # processor should not change logits in-place - self.assertFalse(torch.all(scores == processed_scores)) - def test_forced_bos_token_logits_processor(self): vocab_size = 20 batch_size = 4 diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 449d8122c12b..bfb1ba4111dd 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -209,17 +209,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): } return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - beam_kwargs = { - "early_stopping": False, - "length_penalty": 2.0, - "num_beams": 2, - "num_return_sequences": num_return_sequences, - "num_beam_groups": 2, # one beam per group - "diversity_penalty": 2.0, - } - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): beam_kwargs = { "early_stopping": False, @@ -351,36 +340,6 @@ def _beam_sample_generate( return output_generate - def _group_beam_search_generate( - self, - model, - inputs_dict, - beam_kwargs, - output_scores=False, - output_logits=False, - output_attentions=False, - output_hidden_states=False, - return_dict_in_generate=False, - use_cache=True, - ): - logits_processor_kwargs = self._get_logits_processor_kwargs(do_sample=False, config=model.config) - output_generate = model.generate( - do_sample=False, - max_new_tokens=self.max_new_tokens, - min_new_tokens=self.max_new_tokens, - output_scores=output_scores, - output_logits=output_logits, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict_in_generate=return_dict_in_generate, - use_cache=use_cache, - **beam_kwargs, - **logits_processor_kwargs, - **inputs_dict, - ) - - return output_generate - def _constrained_beam_search_generate( self, model, @@ -747,77 +706,6 @@ def test_generate_without_input_ids(self): ) self.assertIsNotNone(output_ids_generate) - @pytest.mark.generate - def test_group_beam_search_generate(self): - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - model = model_class(config).to(torch_device).eval() - # check `generate()` and `group_beam_search()` are equal - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - # check `group_beam_search` for higher than 1 `num_return_sequences` - num_return_sequences = 2 - beam_kwargs = self._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + 1) - else: - self.assertTrue(output_generate.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - for model_class in self.all_generative_model_classes: - config, 
inputs_dict = self.prepare_config_and_inputs_for_generate() - if self.has_attentions: - config._attn_implementation = "eager" # can't output attentions otherwise - - model = model_class(config).to(torch_device).eval() - beam_kwargs = self._get_diverse_beam_kwargs() - output_generate = self._group_beam_search_generate( - model=model, - inputs_dict=inputs_dict, - beam_kwargs=beam_kwargs, - output_scores=True, - output_logits=True, - output_hidden_states=True, - output_attentions=self.has_attentions, - return_dict_in_generate=True, - use_cache=False, - ) - if model.config.get_text_config(decoder=True).is_encoder_decoder: - self.assertTrue(output_generate.sequences.shape[1] == self.max_new_tokens + 1) - self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchEncoderDecoderOutput) - else: - self.assertTrue( - output_generate.sequences.shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1] - ) - self.assertIsInstance(output_generate, GenerateBeamDecoderOnlyOutput) - # Retrocompatibility check - self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - - self._check_generate_outputs( - output_generate, - model.config, - num_return_sequences=beam_kwargs["num_return_sequences"], - num_beams=beam_kwargs["num_beams"], - ) - @is_flaky() # Some models have position-specific tokens, this test may try to force them in an invalid position @pytest.mark.generate def test_constrained_beam_search_generate(self): @@ -2651,6 +2539,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None): @pytest.mark.generate @require_torch class GenerationIntegrationTests(unittest.TestCase): + # TODO joao, manuel: remove in v4.62.0 @slow def test_diverse_beam_search(self): article = """Justin Timberlake and Jessica Biel, welcome to parenthood. 
@@ -2669,6 +2558,8 @@ def test_diverse_beam_search(self): num_beam_groups=4, diversity_penalty=2.0, remove_invalid_values=True, + trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", ) generated_text = bart_tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -2828,6 +2719,7 @@ def test_generate_input_values_as_encoder_kwarg(self): self.assertListEqual(output_sequences.tolist(), output_sequences_kwargs.tolist()) self.assertEqual(output_sequences.shape, (2, 5)) + # TODO joao, manuel: remove in v4.62.0 def test_transition_scores_group_beam_search_encoder_decoder(self): articles = [ "Justin Timberlake and Jessica Biel, welcome to parenthood.", @@ -2836,12 +2728,14 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): tokenizer = BartTokenizer.from_pretrained("hf-internal-testing/tiny-random-bart") model = BartForConditionalGeneration.from_pretrained( "hf-internal-testing/tiny-random-bart", + eos_token_id=None, + ) + generation_config = GenerationConfig( max_length=10, num_beams=2, num_beam_groups=2, num_return_sequences=2, diversity_penalty=1.0, - eos_token_id=None, return_dict_in_generate=True, output_scores=True, length_penalty=0.0, @@ -2849,7 +2743,12 @@ def test_transition_scores_group_beam_search_encoder_decoder(self): model = model.to(torch_device) input_ids = tokenizer(articles, return_tensors="pt", padding=True).input_ids.to(torch_device) - outputs = model.generate(input_ids=input_ids) + outputs = model.generate( + input_ids=input_ids, + generation_config=generation_config, + trust_remote_code=True, + custom_generate="transformers-community/group-beam-search", + ) transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, outputs.beam_indices) transition_scores_sum = transition_scores.sum(-1) @@ -4820,6 +4719,16 @@ def test_generate_custom_cache_position(self): [ ("transformers-community/dola", {"dola_layers": "low"}), ("transformers-community/contrastive-search", {"penalty_alpha": 0.6, "top_k": 4}), + ( + "transformers-community/group-beam-search", + { + "do_sample": False, + "num_beams": 2, + "num_beam_groups": 2, + "diversity_penalty": 2.0, + "length_penalty": 2.0, + }, + ), ] ) def test_hub_gen_strategies(self, custom_generate, extra_kwargs): diff --git a/tests/models/csm/test_modeling_csm.py b/tests/models/csm/test_modeling_csm.py index f81685abd091..d77a86a201cb 100644 --- a/tests/models/csm/test_modeling_csm.py +++ b/tests/models/csm/test_modeling_csm.py @@ -272,16 +272,6 @@ def test_beam_search_generate_dict_outputs_use_cache(self): def test_beam_sample_generate_dict_output(self): pass - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate(self): - pass - - @pytest.mark.generate - @unittest.skip(reason="CSM does not support group beam search.") - def test_group_beam_search_generate_dict_output(self): - pass - @pytest.mark.generate @unittest.skip(reason="CSM does not support constrained beam search.") def test_constrained_beam_search_generate(self): diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py index 5f51649619fe..2f09b65cf8f3 100644 --- a/tests/models/dia/test_modeling_dia.py +++ b/tests/models/dia/test_modeling_dia.py @@ -237,7 +237,6 @@ def skip_non_greedy_generate(self): skippable_tests = [ "test_sample_generate_dict_output", # return sequences > 1 "test_beam", - "test_group_beam", "test_constrained_beam", "test_contrastive", "test_assisted", diff --git 
a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index df8d2d6e508a..bcb7259004ba 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -138,16 +138,6 @@ def test_constrained_beam_search_generate_dict_output(self): def test_generate_without_input_ids(self): pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate(self): - pass - - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") - @pytest.mark.generate - def test_group_beam_search_generate_dict_output(self): - pass - @unittest.skip(reason="RecurrentGemma is unusual and fails a lot of generation tests") @pytest.mark.generate def test_constrained_beam_search_generate(self): diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 808cef6ddcef..8682c1e75c58 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -401,13 +401,6 @@ def test_greedy_generate_dict_outputs(self): super().test_greedy_generate_dict_outputs() self.has_attentions = old_has_attentions - def test_group_beam_search_generate_dict_output(self): - # This model has a custom attention output shape AND config flags, let's skip those checks - old_has_attentions = self.has_attentions - self.has_attentions = False - super().test_group_beam_search_generate_dict_output() - self.has_attentions = old_has_attentions - def test_sample_generate_dict_output(self): # This model has a custom attention output shape AND config flags, let's skip those checks old_has_attentions = self.has_attentions diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index bc5b065d918b..a2dcccddb929 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -403,12 +403,6 @@ def _get_beam_kwargs(self, num_return_sequences=1): beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] return beam_kwargs - def _get_diverse_beam_kwargs(self, num_return_sequences=1): - # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` - beam_kwargs = super()._get_diverse_beam_kwargs(num_return_sequences=num_return_sequences) - beam_kwargs["num_return_sequences"] = beam_kwargs["num_beams"] - return beam_kwargs - def _get_constrained_beam_kwargs(self, num_return_sequences=1): # Overwritten from `GenerationTesterMixin`, Whisper's `num_return_sequences` differs from the core `generate` beam_kwargs = super()._get_constrained_beam_kwargs(num_return_sequences=num_return_sequences) diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py index d6a34a361354..9ac8e462e8e3 100644 --- a/tests/utils/test_cache_utils.py +++ b/tests/utils/test_cache_utils.py @@ -434,9 +434,7 @@ def test_offloaded_cache_uses_less_memory_than_dynamic_cache(self): inputs = tokenizer(input_text, return_tensors="pt").to(device) common = { "num_beams": 4, - "num_beam_groups": 2, "num_return_sequences": 4, - "diversity_penalty": 1.0, "max_new_tokens": 20, "early_stopping": True, }
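The test changes above point at the migration path this diff implies: group (diverse) beam search leaves core `generate()` and is loaded from the `transformers-community/group-beam-search` Hub repository via `custom_generate` with `trust_remote_code=True`. Below is a minimal sketch of a post-removal call, mirroring the kwargs added to `test_hub_gen_strategies`; the checkpoint (`openai-community/gpt2`) and the prompt are placeholders, not taken from this diff:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

# placeholder checkpoint; any generative text model should work the same way
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
    num_beams=4,
    num_beam_groups=2,      # beams are split into 2 groups of 2; must divide num_beams
    diversity_penalty=1.0,  # Hamming penalty applied across groups
    # group beam search now lives on the Hub instead of in core transformers
    custom_generate="transformers-community/group-beam-search",
    trust_remote_code=True,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

The generation-strategy kwargs are simply forwarded to the Hub implementation, so existing configs keep working once `custom_generate` and `trust_remote_code` are added, as the updated `test_diverse_beam_search` and `test_transition_scores_group_beam_search_encoder_decoder` tests demonstrate.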