From 68103423b6599fcef15573174025985ea9bda30d Mon Sep 17 00:00:00 2001
From: "Qile.Xu"
Date: Thu, 25 Sep 2025 08:01:03 +0000
Subject: [PATCH 1/3] Fix: align Qwen2.5-VL inference rope index with training
 by passing second_per_grid_ts

---
 src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py | 1 +
 src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 6d05cc32f4a8..a98574551922 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1558,6 +1558,7 @@ def prepare_inputs_for_generation(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
                 video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
                 attention_mask=attention_mask,
             )
             self.model.rope_deltas = rope_deltas
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 817d9708d1d6..2a2ee775b7be 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -814,6 +814,7 @@ def prepare_inputs_for_generation(
                 model_inputs.get("input_ids", None),
                 image_grid_thw=image_grid_thw,
                 video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
                 attention_mask=attention_mask,
             )
             self.model.rope_deltas = rope_deltas

From 1320b45db1b660c98d2553fee8b86e6e844d81c5 Mon Sep 17 00:00:00 2001
From: "Qile.Xu"
Date: Fri, 26 Sep 2025 10:09:37 +0000
Subject: [PATCH 2/3] fix: propagate rope_deltas correctly in Qwen2.5-VL

- Forward rope_deltas from Qwen2_5_VLForConditionalGeneration to
  Qwen2_5_VLModel
- Update Qwen2_5_VLModel to accept rope_deltas and store internally
- Refactor prepare_inputs_for_generation to unify rope_deltas handling
- Ensure that passing rope_deltas in forward() now correctly affects
  position_ids calculation

This fixes an issue where passing rope_deltas directly to the model's
forward() had no effect, which could lead to inconsistencies between
pre-fill generation and manual forward calls.
---
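Notes (reviewer commentary, not part of the commit message): below is a minimal
sketch of the behaviour this change enables, assuming the public
Qwen/Qwen2.5-VL-3B-Instruct checkpoint and a text-only prompt for brevity; with
image or video inputs the returned deltas become non-trivial and the alignment
matters more. The rope_deltas returned by a pre-fill forward() can now be
passed back into a manual decode step so the position ids match what
generate() computes internally.

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"  # example checkpoint, any Qwen2.5-VL checkpoint should do
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

inputs = processor(text=["Describe the rope index of Qwen2.5-VL."], return_tensors="pt")

# Pre-fill pass: the multimodal rope index is computed once and the per-sample
# offset comes back on the output as `rope_deltas`.
with torch.no_grad():
    prefill = model(**inputs, use_cache=True)

# Manual decode step: with this patch, forwarding `rope_deltas` shifts the
# decode-time position ids by the same offset that generate() applies internally.
next_token = prefill.logits[:, -1:].argmax(dim=-1)
seq_len = inputs["input_ids"].shape[1]
with torch.no_grad():
    step = model(
        input_ids=next_token,
        past_key_values=prefill.past_key_values,
        cache_position=torch.arange(seq_len, seq_len + 1),
        rope_deltas=prefill.rope_deltas,
        use_cache=True,
    )

Before this change, the second call silently ignored the rope_deltas argument
and fell back to the value cached on the module, so a manual decode step could
diverge from the pre-fill pass.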
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 13 ++++++++-----
 .../models/qwen2_5_vl/modular_qwen2_5_vl.py  | 13 ++++++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index a98574551922..d0c1016e57d2 100644
--- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1282,7 +1282,7 @@ def forward(
                 (cache_position is not None and cache_position[0] == 0)
                 or (past_key_values is None or past_key_values.get_seq_length() == 0)
             )
-            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or rope_deltas is None:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
@@ -1290,18 +1290,20 @@ def forward(
                     second_per_grid_ts=second_per_grid_ts,
                     attention_mask=attention_mask,
                 )
-                self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length, _ = inputs_embeds.shape
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
                 if cache_position is not None:
-                    delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    delta = (cache_position[0] + rope_deltas).to(inputs_embeds.device)
                 else:
                     delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
                 position_ids = position_ids + delta.to(position_ids.device)
 
+        if rope_deltas is not None:
+            self.rope_deltas = rope_deltas
+
         outputs = self.language_model(
             input_ids=None,
             position_ids=position_ids,
@@ -1321,7 +1323,7 @@ def forward(
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-            rope_deltas=self.rope_deltas,
+            rope_deltas=rope_deltas,
         )
         return output if return_dict else output.to_tuple()
 
@@ -1479,6 +1481,7 @@ def forward(
             pixel_values_videos=pixel_values_videos,
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
+            rope_deltas=rope_deltas,
             second_per_grid_ts=second_per_grid_ts,
             position_ids=position_ids,
             attention_mask=attention_mask,
@@ -1561,7 +1564,7 @@ def prepare_inputs_for_generation(
                 second_per_grid_ts=second_per_grid_ts,
                 attention_mask=attention_mask,
             )
-            self.model.rope_deltas = rope_deltas
+            model_inputs["rope_deltas"] = rope_deltas
         # then use the prev pre-calculated rope-deltas to get the correct position ids
         elif "position_ids" in model_inputs:
             batch_size, seq_length = model_inputs["position_ids"].shape
diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index 2a2ee775b7be..d5f488e91ec9 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -606,7 +606,7 @@ def forward(
                 (cache_position is not None and cache_position[0] == 0)
                 or (past_key_values is None or past_key_values.get_seq_length() == 0)
             )
-            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or rope_deltas is None:
                 position_ids, rope_deltas = self.get_rope_index(
                     input_ids,
                     image_grid_thw,
@@ -614,17 +614,19 @@ def forward(
                     second_per_grid_ts=second_per_grid_ts,
                     attention_mask=attention_mask,
                 )
-                self.rope_deltas = rope_deltas
             else:
                 batch_size, seq_length, _ = inputs_embeds.shape
                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
                 if cache_position is not None:
-                    delta = (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    delta = (cache_position[0] + rope_deltas).to(inputs_embeds.device)
                 else:
                     delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
                 position_ids = position_ids + delta.to(position_ids.device)
+        
+        if rope_deltas is not None:
+            self.rope_deltas = rope_deltas
 
         outputs = self.language_model(
             input_ids=None,
@@ -645,7 +647,7 @@ def forward(
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
-            rope_deltas=self.rope_deltas,
+            rope_deltas=rope_deltas,
         )
         return output if return_dict else output.to_tuple()
 
@@ -735,6 +737,7 @@ def forward(
             pixel_values_videos=pixel_values_videos,
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
+            rope_deltas=rope_deltas,
             second_per_grid_ts=second_per_grid_ts,
             position_ids=position_ids,
             attention_mask=attention_mask,
@@ -817,7 +820,7 @@ def prepare_inputs_for_generation(
                 second_per_grid_ts=second_per_grid_ts,
                 attention_mask=attention_mask,
             )
-            self.model.rope_deltas = rope_deltas
+            model_inputs["rope_deltas"] = rope_deltas
         # then use the prev pre-calculated rope-deltas to get the correct position ids
         elif "position_ids" in model_inputs:
             batch_size, seq_length = model_inputs["position_ids"].shape

From 1dc30020a4ba3d6443ea30fa4a5359a8cf849db1 Mon Sep 17 00:00:00 2001
From: "Qile.Xu"
Date: Fri, 26 Sep 2025 11:00:02 +0000
Subject: [PATCH 3/3] Remove white space from blank line.

---
 src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
index d5f488e91ec9..0753aad38826 100644
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -624,7 +624,7 @@ def forward(
                     delta = torch.zeros((batch_size, seq_length), device=inputs_embeds.device)
                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
                 position_ids = position_ids + delta.to(position_ids.device)
-        
+
         if rope_deltas is not None:
             self.rope_deltas = rope_deltas
 
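Series note (commentary, not part of any commit): the decode-path arithmetic
the hunks above modify, reduced to a self-contained sketch with made-up
numbers, showing how the forwarded rope_deltas offsets the position ids once
the pre-fill deltas are known.

import torch

# Decode step: one new token per sequence, two sequences in the batch (made-up values).
batch_size, seq_length = 2, 1
cache_position = torch.tensor([37])      # index of the new token in the KV cache
rope_deltas = torch.tensor([[3], [5]])   # per-sample offsets returned by get_rope_index

# Same computation as the `else` branch in Qwen2_5_VLModel.forward above.
position_ids = torch.arange(seq_length).view(1, 1, -1).expand(3, batch_size, -1)
delta = cache_position[0] + rope_deltas                        # shape (batch_size, 1)
delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=1)
position_ids = position_ids + delta.to(position_ids.device)

print(position_ids[:, 0], position_ids[:, 1])  # 40 for sample 0, 42 for sample 1, on all three rope axes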