Merged
2 changes: 1 addition & 1 deletion src/transformers/generation/utils.py
@@ -2055,7 +2055,7 @@ def _prepare_cache_for_generation(
generation_config.cache_implementation = None

generation_config.cache_implementation = generation_config.cache_implementation or getattr(
self.config.get_text_config(), "cache_implementation", None
self.config.get_text_config(decoder=True), "cache_implementation", None
)
if generation_config.cache_implementation is not None:
if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
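For context on this one-liner: `get_text_config(decoder=True)` asks a composite config for its decoder sub-config, so a `cache_implementation` stored there is now picked up instead of being looked up on the top-level config. A minimal sketch, with the checkpoint chosen only for illustration:

```python
# Hedged sketch of why `decoder=True` matters: for a composite config, the decoder
# sub-config is the one that can carry `cache_implementation`. Checkpoint and lookup
# below are illustrative, not taken from this PR.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("facebook/musicgen-small")  # composite config with a `decoder` sub-config
decoder_config = config.get_text_config(decoder=True)           # resolves to the decoder sub-config
print(getattr(decoder_config, "cache_implementation", None))    # None unless set explicitly
```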
@@ -1215,12 +1215,15 @@ def _prepare_model_inputs(
cache_methods = [
"_prepare_cache_for_generation",
"_get_cache",
"_supports_default_dynamic_cache",
"_get_layer_device_map_for_cache_init",
]
for method in cache_methods:
setattr(self.codec_model, method, types.MethodType(getattr(self, method).__func__, self.codec_model))

setattr(
self.codec_model, "_supports_default_dynamic_cache", types.MethodType(lambda x: True, self.codec_model)
)

self.codec_model._prepare_cache_for_generation(
generation_config=self.codec_model.generation_config,
model_kwargs=temporary_model_kwargs,
@@ -344,12 +344,15 @@ def _prepare_model_inputs(
cache_methods = [
"_prepare_cache_for_generation",
"_get_cache",
"_supports_default_dynamic_cache",
"_get_layer_device_map_for_cache_init",
]
for method in cache_methods:
setattr(self.codec_model, method, types.MethodType(getattr(self, method).__func__, self.codec_model))

setattr(
self.codec_model, "_supports_default_dynamic_cache", types.MethodType(lambda x: True, self.codec_model)
)

self.codec_model._prepare_cache_for_generation(
generation_config=self.codec_model.generation_config,
model_kwargs=temporary_model_kwargs,
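A minimal, self-contained sketch of the rebinding pattern these two hunks rely on; `Parent` and `Codec` are stand-ins, not the real models. `getattr(self, method).__func__` recovers the plain function and `types.MethodType` binds it to the codec model, while `_supports_default_dynamic_cache` is instead replaced with a lambda that always answers `True`:

```python
import types

class Parent:
    def describe(self):
        return f"running on {type(self).__name__}"

class Codec:
    pass

parent, codec = Parent(), Codec()

# `parent.describe` is a bound method; `.__func__` recovers the plain function,
# and MethodType binds it to the codec instance instead (mirrors the loop above).
codec.describe = types.MethodType(parent.describe.__func__, codec)
print(codec.describe())  # -> "running on Codec"

# For `_supports_default_dynamic_cache` the diff swaps in a lambda, because a
# constant answer is all the codec model needs here.
codec.supports_cache = types.MethodType(lambda self: True, codec)
print(codec.supports_cache())  # -> True
```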
30 changes: 26 additions & 4 deletions src/transformers/models/musicgen/modeling_musicgen.py
@@ -1246,7 +1246,29 @@ def generate(
input_ids_length=input_ids_length,
)

# 6. Prepare `input_ids` which will be used for auto-regressive generation
self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)

# 6. Prepare the cache.
# - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
# - different models have a different cache name expected by the model (default = "past_key_values")
# - `max_length`, prepared above, is used to determine the maximum cache length
max_cache_length = generation_config.max_length - 1
if (
inputs_tensor.shape[1] != input_ids_length
and model_input_name == "inputs_embeds"
and not self.config.is_encoder_decoder
):
max_cache_length += inputs_tensor.shape[1]
self._prepare_cache_for_generation(
generation_config,
model_kwargs,
assistant_model=None,
batch_size=batch_size,
max_cache_length=max_cache_length,
device=inputs_tensor.device,
)

# 7. Prepare `input_ids` which will be used for auto-regressive generation
# Build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
input_ids, delay_pattern_mask = self.build_delay_pattern_mask(
input_ids,
@@ -1260,15 +1282,15 @@
# stash the delay mask so that we don't have to recompute it in each forward pass
model_kwargs["delay_pattern_mask"] = delay_pattern_mask

# 7. determine generation mode
# 8. determine generation mode
generation_mode = generation_config.get_generation_mode()

# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
# 9. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None

# 9. prepare distribution pre_processing samplers
# 10. prepare distribution pre_processing samplers
logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_length,
@@ -2162,6 +2162,28 @@ def generate(
input_ids_length=input_ids_length,
)

self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)

# 7. Prepare the cache.
# - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
# - different models have a different cache name expected by the model (default = "past_key_values")
# - `max_length`, prepared above, is used to determine the maximum cache length
max_cache_length = generation_config.max_length - 1
if (
inputs_tensor.shape[1] != input_ids_length
and model_input_name == "inputs_embeds"
and not self.config.is_encoder_decoder
):
max_cache_length += inputs_tensor.shape[1]
self._prepare_cache_for_generation(
generation_config,
model_kwargs,
assistant_model=None,
batch_size=batch_size,
max_cache_length=max_cache_length,
device=inputs_tensor.device,
)

# build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
input_ids, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
input_ids,
@@ -2175,15 +2197,15 @@
if streamer is not None:
streamer.put(input_ids.cpu())

# 7. determine generation mode
# 8. determine generation mode
generation_mode = generation_config.get_generation_mode()

# 8. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
# 9. prepare batched CFG externally (to enable coexistence with the unbatched CFG)
if generation_config.guidance_scale is not None and generation_config.guidance_scale > 1:
logits_processor.append(ClassifierFreeGuidanceLogitsProcessor(generation_config.guidance_scale))
generation_config.guidance_scale = None

# 9. prepare distribution pre_processing samplers
# 10. prepare distribution pre_processing samplers
logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_length,
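Both `generate` overrides now mirror the cache-preparation step from `GenerationMixin.generate`, so cache-related settings in the generation config are honored rather than silently falling back to a default cache. A hedged usage sketch — the checkpoint and the viability of a static cache for MusicGen are assumptions, not verified here:

```python
# Hedged sketch: after this change, cache-related generation arguments should be routed
# through `_prepare_cache_for_generation` in MusicGen's custom `generate`.
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["80s pop track with bassy drums"], padding=True, return_tensors="pt")
# `cache_implementation` is forwarded into the generation config; "static" is only an example.
audio = model.generate(**inputs, max_new_tokens=256, cache_implementation="static")
```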
9 changes: 0 additions & 9 deletions src/transformers/models/rag/modeling_rag.py
@@ -1204,8 +1204,6 @@ def _reorder_stacked(hidden_states, new_order):
if isinstance(past_key_values, EncoderDecoderCache):
reordered_past = EncoderDecoderCache.from_legacy_cache(reordered_past)

if isinstance(past_key_values, EncoderDecoderCache):
reordered_past = EncoderDecoderCache.from_legacy_cache(reordered_past)
return reordered_past

def marginalize(self, seq_logits, doc_scores, n_docs=None):
@@ -1593,13 +1591,6 @@ def extend_enc_output(tensor, num_beams=None):
if generation_config.num_return_sequences > generation_config.num_beams:
raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.")

# 11. interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_beams,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
return self._beam_search(
input_ids,
logits_processor=pre_processor,
25 changes: 18 additions & 7 deletions src/transformers/models/roformer/modeling_roformer.py
@@ -261,6 +261,17 @@ def forward(
.transpose(1, 2)
)

# Apply RoPE if self attention
if not is_cross_attention and sinusoidal_pos is not None:
if self.rotary_value:
query_layer, key_layer, value_layer = self.apply_rotary_position_embeddings(
sinusoidal_pos, query_layer, key_layer, value_layer
)
else:
query_layer, key_layer = self.apply_rotary_position_embeddings(
sinusoidal_pos, query_layer, key_layer
)
Comment on lines +264 to +273
Collaborator
was it deleted at some point and here you just add it back?

Member Author
yep, accidentally deleted RoPE 🙈
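For readers wondering what the restored block computes, here is a rough, self-contained sketch of RoFormer-style rotary position embeddings (pairwise rotation of query/key features); shapes and helper names are illustrative, not the exact `apply_rotary_position_embeddings` signature:

```python
# Rough sketch of RoFormer-style rotary embeddings; illustrative only.
import torch

def rotate_pairs(x, sin, cos):
    # x: [batch, heads, seq, dim]; sin/cos: [1, 1, seq, dim] built from sinusoidal positions.
    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    rotated = torch.stack((-x_odd, x_even), dim=-1).reshape_as(x)
    return x * cos + rotated * sin

seq, dim = 8, 16
pos = torch.arange(seq).float()[:, None]
freqs = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
angles = pos * freqs                                              # [seq, dim/2]
sin = torch.repeat_interleave(angles.sin(), 2, dim=-1)[None, None]
cos = torch.repeat_interleave(angles.cos(), 2, dim=-1)[None, None]

q, k = torch.randn(1, 2, seq, dim), torch.randn(1, 2, seq, dim)
q_rot, k_rot = rotate_pairs(q, sin, cos), rotate_pairs(k, sin, cos)  # values only go into self-attention
```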


if past_key_value is not None:
# save all key/value_layer to cache to be re-used for fast auto-regressive generation
cache_position = cache_position if not is_cross_attention else None
@@ -381,13 +392,13 @@ def forward(
):
self_outputs = self.self(
hidden_states,
attention_mask,
sinusoidal_pos,
head_mask,
encoder_hidden_states,
past_key_value,
output_attentions,
cache_position,
attention_mask=attention_mask,
sinusoidal_pos=sinusoidal_pos,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
past_key_value=past_key_value,
output_attentions=output_attentions,
cache_position=cache_position,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
2 changes: 1 addition & 1 deletion src/transformers/models/superglue/modeling_superglue.py
@@ -274,7 +274,7 @@ def forward(
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
current_states = encoder_hidden_states if is_cross_attention else hidden_states
attention_mask = encoder_attention_mask if is_cross_attention else encoder_attention_mask
attention_mask = encoder_attention_mask if is_cross_attention else attention_mask

batch_size = hidden_states.shape[0]
key_layer = (
4 changes: 2 additions & 2 deletions tests/models/llava_next/test_modeling_llava_next.py
@@ -515,7 +515,7 @@ def test_small_model_integration_test_full_vision_state_selection(self):
# test that changing `strategy` won't error out
model.vision_feature_select_strategy = "full"

inputs = self.processor(self.prompt, self.image, return_tensors="pt").to(model.device)
inputs = self.processor(text=self.prompt, images=self.image, return_tensors="pt").to(model.device)

# verify generation
output = model.generate(**inputs, max_new_tokens=30)
@@ -536,7 +536,7 @@ def test_granite_vision(self):
model = LlavaNextForConditionalGeneration.from_pretrained(granite_model_path)
self.processor = AutoProcessor.from_pretrained(granite_model_path)
prompt = "<|user|>\n<image>\nWhat is shown in this image?\n<|assistant|>\n"
inputs = self.processor(prompt, self.image, return_tensors="pt").to(model.device)
inputs = self.processor(text=prompt, images=self.image, return_tensors="pt").to(model.device)

# verify generation
output = model.generate(**inputs, max_new_tokens=30)
@@ -467,7 +467,9 @@ def test_small_model_integration_test_batch_matches_single(self):
padding=True,
).to(torch_device)

inputs_single = self.processor(self.prompt_video, videos=[self.video], return_tensors="pt").to(torch_device)
inputs_single = self.processor(text=self.prompt_video, videos=[self.video], return_tensors="pt").to(
torch_device
)

# verify generation
output_batched = model.generate(**inputs_batched, do_sample=False, max_new_tokens=50)
3 changes: 1 addition & 2 deletions tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
@@ -413,7 +413,6 @@ def attention_mask_padding_matches_padding_free_with_position_ids(
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]

torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), rtol=0, atol=0)
# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
@@ -698,7 +697,7 @@ def test_small_model_integration_test_batch(self):
)
text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(
text=text * 2,
text=[text] * 2,
audio=[self.raw_audio, self.raw_audio],
images=[self.raw_image, self.raw_image],
return_tensors="pt",
1 change: 0 additions & 1 deletion tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -403,7 +403,6 @@ def attention_mask_padding_matches_padding_free_with_position_ids(
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]

torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), rtol=0, atol=0)
Collaborator
ok, can't find it in the common test file, so fine.

Member Author
this one doesn't make sense because we want to check logits, not sampled argmax tokens. Even a tiny diff in logits can give different tokens, and the next-line check with torch.allclose is enough
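A tiny illustration of the point, with made-up numbers: near-tied logits can flip `argmax` under a perturbation that the element-wise tolerance check absorbs, which is why only the comparison on raw logits is kept:

```python
# Near-tied logits: argmax flips, but the element-wise tolerance check still passes.
import torch

tol = torch.finfo(torch.bfloat16).eps  # ~7.8e-3

logits_padded = torch.tensor([2.5000, 2.5005, -1.0])
logits_padfree = torch.tensor([2.5006, 2.5001, -1.0])

torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)  # passes
print(logits_padded.argmax(), logits_padfree.argmax())  # tensor(1) tensor(0) -> argmax flipped
```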

# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
1 change: 0 additions & 1 deletion tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -362,7 +362,6 @@ def attention_mask_padding_matches_padding_free_with_position_ids(
logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()]
logits_padfree = res_padfree.logits[0]

torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), rtol=0, atol=0)
# acceptable numerical instability
tol = torch.finfo(torch.bfloat16).eps
torch.testing.assert_close(logits_padded, logits_padfree, rtol=tol, atol=tol)
8 changes: 4 additions & 4 deletions tests/pipelines/test_pipelines_image_text_to_text.py
Collaborator
do you know what caused these changes?

Member Author
@zucchini-nlp, Jul 28, 2025
yep, after #39374 we started using a higher default max length. The text-only generation pipeline already uses it, so it's fine
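The longer expected strings below follow from that higher default; a caller who still wants short outputs can cap generation explicitly. A hedged sketch — the checkpoint, image URL, and exact kwarg routing are illustrative, not taken from these tests:

```python
# Hedged sketch: cap generation length explicitly so pipeline output stays short,
# regardless of the default max length.
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
out = pipe(
    images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
    text="<image> What this is? Assistant: This is",
    generate_kwargs={"max_new_tokens": 20},  # explicit cap instead of the (now larger) default
)
print(out[0]["generated_text"])
```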

@@ -119,7 +119,7 @@ def test_small_model_pt_token_text_only(self):
},
{
"role": "assistant",
"content": "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom",
"content": "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom natural language processing\nTo machine learning and more, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and services\nFrom image and speech recognition\nTo text and language translation, they've got it all\n\nThey've made it possible for us to be more\nInformed and efficient, with their tools and",
},
],
}
@@ -150,7 +150,7 @@ def test_small_model_pt_token(self):
[
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable. The photo captures a moment of tranquility and companionship between the two feline friends.",
}
],
)
@@ -161,11 +161,11 @@
[
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
},
{
"input_text": "<image> What this is? Assistant: This is",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they",
"generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are facing the camera, and they appear to be sleeping or resting. The blanket is placed on a couch, and the cats are positioned in such a way that they are facing the camera. The image captures a peaceful moment between the two cats, and it's a great way to showcase their cuteness and relaxed demeanor.",
},
],
)