diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 2907b2b987cb..c6d694f68218 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -2771,6 +2771,7 @@ def _get_resized_embeddings(
         old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
 
         if old_num_tokens == new_num_tokens and not is_deepspeed_zero3_enabled():
+            old_embeddings.num_embeddings = new_num_tokens  # maybe weights are tied which doesn't update attr
             return old_embeddings
 
         if not isinstance(old_embeddings, nn.Embedding):
@@ -2910,6 +2911,7 @@ def _get_resized_lm_head(
         )
 
         if old_num_tokens == new_num_tokens and not is_deepspeed_zero3_enabled():
+            old_lm_head.out_features = new_num_tokens  # maybe weights are tied which doesn't update attr
             return old_lm_head
 
         if not isinstance(old_lm_head, nn.Linear):
diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index 3ecd6344dc07..a197a02ed590 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -960,12 +960,6 @@ def __init__(self, config: Gemma3Config):
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
 
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
     @auto_docstring
     def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]):
         return self.model.get_image_features(pixel_values, **kwargs)
diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py
index d78002ed76c3..e61c5f0038e7 100644
--- a/src/transformers/models/gemma3n/configuration_gemma3n.py
+++ b/src/transformers/models/gemma3n/configuration_gemma3n.py
@@ -30,7 +30,6 @@
 if is_timm_available():
     from timm.data import ImageNetInfo, infer_imagenet_subset
 
-
 logger = logging.get_logger(__name__)
 
 
diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index 3a41bb261c43..9ebf8a5d1c07 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -43,6 +43,7 @@
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
+    is_accelerate_available,
     torch_compilable_check,
 )
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
@@ -51,6 +52,10 @@
 from .configuration_gemma3n import Gemma3nAudioConfig, Gemma3nConfig, Gemma3nTextConfig, Gemma3nVisionConfig
 
 
+if is_accelerate_available():
+    from accelerate.hooks import add_hook_to_module
+
+
 @dataclass
 @auto_docstring
 class Gemma3nAudioEncoderModelOutput(BaseModelOutputWithPooling):
@@ -1406,6 +1411,44 @@ def _init_weights(self, module):
         if hasattr(module, "gradient_clipping"):
             init.constant_(module.gradient_clipping, self.config.gradient_clipping)
 
+    def get_per_layer_input_embeddings(self):
+        return self.base_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.base_model.embed_tokens_per_layer = value
+
+    def resize_token_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ) -> nn.Embedding:
+        inputs_embeds = super().resize_token_embeddings(
+            new_num_tokens=new_num_tokens,
+            pad_to_multiple_of=pad_to_multiple_of,
+            mean_resizing=mean_resizing,
+        )
+        self._resize_per_layer_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        return inputs_embeds
+
+    def _resize_per_layer_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ):
+        self.config.get_text_config().vocab_size_per_layer_input = self.vocab_size
+        if self.config.get_text_config().hidden_size_per_layer_input:
+            embed_tokens_per_layer = self.get_per_layer_input_embeddings()
+            new_embeddings_per_layer = self._get_resized_embeddings(
+                embed_tokens_per_layer, new_num_tokens, pad_to_multiple_of, mean_resizing
+            )
+            if hasattr(embed_tokens_per_layer, "_hf_hook"):
+                hook = embed_tokens_per_layer._hf_hook
+                add_hook_to_module(new_embeddings_per_layer, hook)
+            new_embeddings_per_layer.requires_grad_(embed_tokens_per_layer.weight.requires_grad)
+            self.set_per_layer_input_embeddings(new_embeddings_per_layer)
+
 
 class Gemma3nAudioEncoder(Gemma3nPreTrainedModel):
     """
@@ -2128,6 +2171,12 @@ def forward(
             audio_hidden_states=audio_features if input_features is not None else None,
         )
 
+    def get_per_layer_input_embeddings(self):
+        return self.language_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.language_model.embed_tokens_per_layer = value
+
     @can_return_tuple
     @auto_docstring(custom_intro="Projects the last hidden state from the audio encoder into language model space.")
     def get_audio_features(
@@ -2167,12 +2216,6 @@ def __init__(self, config: Gemma3nConfig):
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
 
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
     @auto_docstring
     def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]):
         return self.model.get_image_features(pixel_values, **kwargs)
@@ -2323,6 +2366,12 @@ def prepare_inputs_for_generation(
 
         return model_inputs
 
+    def get_per_layer_input_embeddings(self):
+        return self.model.get_per_layer_input_embeddings()
+
+    def set_per_layer_input_embeddings(self, value):
+        self.model.set_per_layer_input_embeddings(value)
+
 
 __all__ = [
     "Gemma3nAudioEncoder",
diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py
index c531291fc584..12146b7954e3 100644
--- a/src/transformers/models/gemma3n/modular_gemma3n.py
+++ b/src/transformers/models/gemma3n/modular_gemma3n.py
@@ -31,7 +31,14 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_compilable_check
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_accelerate_available,
+    logging,
+    torch_compilable_check,
+)
 from ...utils.generic import merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel
@@ -59,6 +66,9 @@
 from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig
 
 
+if is_accelerate_available():
+    from accelerate.hooks import add_hook_to_module
+
 logger = logging.get_logger(__name__)
 
 
@@ -1653,6 +1663,44 @@ def _init_weights(self, module):
         if hasattr(module, "gradient_clipping"):
             init.constant_(module.gradient_clipping, self.config.gradient_clipping)
 
+    def get_per_layer_input_embeddings(self):
+        return self.base_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.base_model.embed_tokens_per_layer = value
+
+    def resize_token_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ) -> nn.Embedding:
+        inputs_embeds = super().resize_token_embeddings(
+            new_num_tokens=new_num_tokens,
+            pad_to_multiple_of=pad_to_multiple_of,
+            mean_resizing=mean_resizing,
+        )
+        self._resize_per_layer_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        return inputs_embeds
+
+    def _resize_per_layer_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ):
+        self.config.get_text_config().vocab_size_per_layer_input = self.vocab_size
+        if self.config.get_text_config().hidden_size_per_layer_input:
+            embed_tokens_per_layer = self.get_per_layer_input_embeddings()
+            new_embeddings_per_layer = self._get_resized_embeddings(
+                embed_tokens_per_layer, new_num_tokens, pad_to_multiple_of, mean_resizing
+            )
+            if hasattr(embed_tokens_per_layer, "_hf_hook"):
+                hook = embed_tokens_per_layer._hf_hook
+                add_hook_to_module(new_embeddings_per_layer, hook)
+            new_embeddings_per_layer.requires_grad_(embed_tokens_per_layer.weight.requires_grad)
+            self.set_per_layer_input_embeddings(new_embeddings_per_layer)
+
 
 class Gemma3nAudioEncoder(Gemma3nPreTrainedModel):
     """
@@ -1995,6 +2043,12 @@ def __init__(self, config: Gemma3nConfig):
         self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, config.text_config)
         self.embed_audio = Gemma3nMultimodalEmbedder(config.audio_config, config.text_config)
 
+    def get_per_layer_input_embeddings(self):
+        return self.language_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.language_model.embed_tokens_per_layer = value
+
     @can_return_tuple
     @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.")
     def get_image_features(
@@ -2230,6 +2284,12 @@ def get_audio_features(
 class Gemma3nForConditionalGeneration(PaliGemmaForConditionalGeneration):
     accepts_loss_kwargs = False
 
+    def get_per_layer_input_embeddings(self):
+        return self.model.get_per_layer_input_embeddings()
+
+    def set_per_layer_input_embeddings(self, value):
+        self.model.set_per_layer_input_embeddings(value)
+
     @can_return_tuple
     @auto_docstring
     def forward(
diff --git a/src/transformers/models/gemma4/modeling_gemma4.py b/src/transformers/models/gemma4/modeling_gemma4.py
index 406aa0ac72cd..a9b2de0daeb6 100644
--- a/src/transformers/models/gemma4/modeling_gemma4.py
+++ b/src/transformers/models/gemma4/modeling_gemma4.py
@@ -46,13 +46,24 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
+from ...utils import (
+    ModelOutput,
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_accelerate_available,
+    torch_compilable_check,
+)
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import OutputRecorder, capture_outputs
 from ..auto.modeling_auto import AutoModel
 from .configuration_gemma4 import Gemma4AudioConfig, Gemma4Config, Gemma4TextConfig, Gemma4VisionConfig
 
 
+if is_accelerate_available():
+    from accelerate.hooks import add_hook_to_module
+
+
 @dataclass
 @auto_docstring(
     custom_intro="""
@@ -1425,19 +1436,20 @@ def forward(self, input_ids: torch.Tensor):
         return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)
 
 
-# ---- Model Classes ----
-
-
+@auto_docstring
 class Gemma4PreTrainedModel(PreTrainedModel):
     config: Gemma4Config
+    base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"]
+    _skip_keys_device_placement = ["past_key_values", "shared_kv_states"]
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
+    _can_compile_fullgraph = True
     _supports_attention_backend = True
-    _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"]
-    _skip_keys_device_placement = ["past_key_values", "shared_kv_states"]
+    _can_record_outputs = None  # override
     input_modalities = ("image", "text", "video", "audio")
 
     @torch.no_grad()
     def _init_weights(self, module):
@@ -1493,6 +1505,44 @@ def _init_weights(self, module):
             init.zeros_(module.std_bias)
             init.ones_(module.std_scale)
 
+    def get_per_layer_input_embeddings(self):
+        return self.base_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.base_model.embed_tokens_per_layer = value
+
+    def resize_token_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ) -> nn.Embedding:
+        inputs_embeds = super().resize_token_embeddings(
+            new_num_tokens=new_num_tokens,
+            pad_to_multiple_of=pad_to_multiple_of,
+            mean_resizing=mean_resizing,
+        )
+        self._resize_per_layer_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        return inputs_embeds
+
+    def _resize_per_layer_embeddings(
+        self,
+        new_num_tokens: int | None = None,
+        pad_to_multiple_of: int | None = None,
+        mean_resizing: bool = True,
+    ):
+        self.config.get_text_config().vocab_size_per_layer_input = self.vocab_size
+        if self.config.get_text_config().hidden_size_per_layer_input:
+            embed_tokens_per_layer = self.get_per_layer_input_embeddings()
+            new_embeddings_per_layer = self._get_resized_embeddings(
+                embed_tokens_per_layer, new_num_tokens, pad_to_multiple_of, mean_resizing
+            )
+            if hasattr(embed_tokens_per_layer, "_hf_hook"):
+                hook = embed_tokens_per_layer._hf_hook
+                add_hook_to_module(new_embeddings_per_layer, hook)
+            new_embeddings_per_layer.requires_grad_(embed_tokens_per_layer.weight.requires_grad)
+            self.set_per_layer_input_embeddings(new_embeddings_per_layer)
+
 
 @auto_docstring(custom_intro="The base Gemma 4 language model without a language modeling head.")
 class Gemma4TextModel(Gemma4PreTrainedModel):
@@ -2331,6 +2381,12 @@ def forward(
             audio_hidden_states=audio_features if input_features is not None else None,
         )
 
+    def get_per_layer_input_embeddings(self):
+        return self.language_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.language_model.embed_tokens_per_layer = value
+
     @can_return_tuple
     @auto_docstring(custom_intro="Projects the last hidden state from the audio encoder into language model space.")
     def get_audio_features(
@@ -2402,12 +2458,6 @@ def __init__(self, config: Gemma4Config):
         ]
         self.post_init()
 
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
     @auto_docstring
     def get_image_features(
         self,
@@ -2536,6 +2586,12 @@ def prepare_inputs_for_generation(
 
         return model_inputs
 
+    def get_per_layer_input_embeddings(self):
+        return self.model.get_per_layer_input_embeddings()
+
+    def set_per_layer_input_embeddings(self, value):
+        self.model.set_per_layer_input_embeddings(value)
+
     @staticmethod
     def create_masks_for_generate(
         config: PreTrainedConfig,
diff --git a/src/transformers/models/gemma4/modular_gemma4.py b/src/transformers/models/gemma4/modular_gemma4.py
index 3f43ef1075da..c68985fb0255 100644
--- a/src/transformers/models/gemma4/modular_gemma4.py
+++ b/src/transformers/models/gemma4/modular_gemma4.py
@@ -41,6 +41,7 @@
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
+    is_accelerate_available,
     logging,
     torch_compilable_check,
 )
@@ -62,6 +63,7 @@
     Gemma3nModel,
     Gemma3nModelOutputWithPast,
     Gemma3nMultimodalEmbedder,
+    Gemma3nPreTrainedModel,
     Gemma3nRMSNorm,
     apply_rotary_pos_emb,
     eager_attention_forward,
@@ -72,6 +74,10 @@
 from .configuration_gemma4 import Gemma4AudioConfig, Gemma4Config, Gemma4TextConfig, Gemma4VisionConfig
 
 
+if is_accelerate_available():
+    pass
+
+
 logger = logging.get_logger(__name__)
 
 
@@ -1152,21 +1158,15 @@ class Gemma4TextScaledWordEmbedding(Gemma3TextScaledWordEmbedding):
 # ---- Model Classes ----
 
 
-class Gemma4PreTrainedModel(PreTrainedModel):
-    config: Gemma4Config
-    supports_gradient_checkpointing = True
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-    _can_compile_fullgraph = True
-    _supports_attention_backend = True
+class Gemma4PreTrainedModel(Gemma3nPreTrainedModel):
     _no_split_modules = ["Gemma4TextDecoderLayer", "Gemma4VisionEncoderLayer", "Gemma4AudioLayer"]
-    _skip_keys_device_placement = ["past_key_values", "shared_kv_states"]
     input_modalities = ("image", "text", "video", "audio")
+    _can_record_outputs = None  # override
+    _skip_keys_device_placement = ["past_key_values", "shared_kv_states"]
 
     @torch.no_grad()
     def _init_weights(self, module):
-        super()._init_weights(module)
+        PreTrainedModel._init_weights(module)
         if isinstance(module, Gemma4VisionPatchEmbedder):
             init.ones_(module.position_embedding_table)
         elif isinstance(module, Gemma4AudioRelPositionalEncoding):
@@ -1720,6 +1720,12 @@ def __init__(self, config: Gemma4Config):
             f"language_model.{name}" for name in self.language_model._keys_to_ignore_on_load_unexpected
         ]
 
+    def get_per_layer_input_embeddings(self):
+        return self.language_model.embed_tokens_per_layer
+
+    def set_per_layer_input_embeddings(self, value):
+        self.language_model.embed_tokens_per_layer = value
+
     @can_return_tuple
     @auto_docstring(custom_intro="Projects the last hidden state from the vision model into language model space.")
     def get_image_features(
@@ -2011,6 +2017,12 @@ def __init__(self, config: Gemma4Config):
             f"model.{name}" for name in self.model._keys_to_ignore_on_load_unexpected
         ]
 
+    def get_per_layer_input_embeddings(self):
+        return self.model.get_per_layer_input_embeddings()
+
+    def set_per_layer_input_embeddings(self, value):
+        self.model.set_per_layer_input_embeddings(value)
+
     def forward(
         self,
         input_ids: torch.LongTensor | None = None,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index 369514a55f76..6eeeaa6bd681 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -398,12 +398,6 @@ def __init__(self, config: PaliGemmaConfig):
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
 
-    def get_input_embeddings(self):
-        return self.model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.model.set_input_embeddings(value)
-
     @auto_docstring
     def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs: Unpack[TransformersKwargs]):
         return self.model.get_image_features(pixel_values, **kwargs)
diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py
index a6b9b5392194..1f41875c5def 100644
--- a/src/transformers/models/t5gemma/modeling_t5gemma.py
+++ b/src/transformers/models/t5gemma/modeling_t5gemma.py
@@ -961,6 +961,12 @@ def __init__(self, config: T5GemmaConfig):
 
     def set_output_embeddings(self, new_embeddings):
         self.lm_head.out_proj = new_embeddings
+        # The tying happens from decoder to lm-head, but when resizing
+        # the resized embed is assigned only to the head. Then tying weights
+        # again reverts everything back. So we have to update decoder here
+        if self.config.tie_word_embeddings:
+            self.model.decoder.embed_tokens.weight = new_embeddings.weight
+            self.model.decoder.embed_tokens.num_embeddings = new_embeddings.weight.shape[0]
 
     def get_output_embeddings(self):
         return self.lm_head.out_proj
diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index c7d4a4051959..1c8846ad74b9 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -800,6 +800,12 @@ def __init__(self, config: T5GemmaConfig):
 
     def set_output_embeddings(self, new_embeddings):
         self.lm_head.out_proj = new_embeddings
+        # The tying happens from decoder to lm-head, but when resizing
+        # the resized embed is assigned only to the head. Then tying weights
+        # again reverts everything back. So we have to update decoder here
+        if self.config.tie_word_embeddings:
+            self.model.decoder.embed_tokens.weight = new_embeddings.weight
+            self.model.decoder.embed_tokens.num_embeddings = new_embeddings.weight.shape[0]
 
     def get_output_embeddings(self):
         return self.lm_head.out_proj
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index 47a30b88db83..0a3561d45db2 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -652,10 +652,10 @@ def prepare_config_and_inputs_for_common(self):
         config, input_ids, attention_mask, pixel_values = config_and_inputs
         inputs_dict = {
             "input_ids": input_ids,
-            "decoder_input_ids": input_ids,
+            "decoder_input_ids": input_ids.clone(),
             "attention_mask": attention_mask,
             "pixel_values": pixel_values,
-            "labels": input_ids,
+            "labels": input_ids.clone(),
         }
         return config, inputs_dict
diff --git a/tests/models/colmodernvbert/test_modeling_colmodernvbert.py b/tests/models/colmodernvbert/test_modeling_colmodernvbert.py
index 2f5134036d52..4b7391407529 100755
--- a/tests/models/colmodernvbert/test_modeling_colmodernvbert.py
+++ b/tests/models/colmodernvbert/test_modeling_colmodernvbert.py
@@ -49,6 +49,7 @@ def __init__(
         parent,
         batch_size=2,
         num_images=2,
+        seq_length=7,
         ignore_index=-100,
         text_config=None,
         is_training=False,
@@ -98,10 +99,11 @@ def __init__(
         self.pixel_shuffle_factor = pixel_shuffle_factor
         self.image_token_id = self.text_config["vocab_size"] - 1
         self.pad_token_id = text_config["pad_token_id"]
-        self.seq_length = (
+        self.image_seq_length = (
             int(((vision_config["image_size"] // vision_config["patch_size"]) ** 2) / (pixel_shuffle_factor**2))
             * self.num_images
         )
+        self.seq_length = seq_length + self.image_seq_length
         self.hidden_size = text_config["hidden_size"]
         self.num_hidden_layers = text_config["num_hidden_layers"]
@@ -136,9 +138,9 @@ def prepare_config_and_inputs_for_common(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.vlm_config.text_config.vocab_size)
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
 
-        # For simplicity just set the last n tokens to the image token
-        n_image_tokens_per_batch = self.seq_length
-        input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
+        # For simplicity just set the first n tokens to the image token
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[:, : self.image_seq_length] = self.image_token_id
         attention_mask = input_ids.ne(1).to(torch_device)
         inputs_dict = {
             "pixel_values": pixel_values,
diff --git a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
index c14e3933f77b..6e0576efe3d5 100644
--- a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
+++ b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py
@@ -135,7 +135,7 @@ def prepare_config_and_inputs_for_common(self):
 
         # For simplicity just set the last n tokens to the image token
         input_ids[input_ids == self.image_token_id] = self.text_config["pad_token_id"]
-        input_ids[:, -self.image_seq_length :] = self.image_token_id
+        input_ids[:, : self.image_seq_length] = self.image_token_id
         attention_mask = input_ids.ne(1).to(torch_device)
 
         inputs_dict = {
diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
index 1284ff45be0f..9874ce4a8203 100644
--- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
@@ -103,8 +103,8 @@ def place_image_tokens(self, input_ids, config):
         input_ids[input_ids == self.image_token_id] = self.pad_token_id
         input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
         # Place image tokens with vision_start_token_id prefix
-        input_ids[:, -1] = self.image_token_id
-        input_ids[:, -2] = self.vision_start_token_id
+        input_ids[:, 1] = self.image_token_id
+        input_ids[:, 0] = self.vision_start_token_id
         return input_ids
 
     def get_additional_inputs(self, config, input_ids, pixel_values):
diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
index 2de7b384d075..0b0523de3b71 100644
--- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
+++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -102,8 +102,8 @@ def place_image_tokens(self, input_ids, config):
         input_ids[input_ids == self.image_token_id] = self.pad_token_id
         input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
         # Place image tokens with vision_start_token_id prefix
-        input_ids[:, -1] = self.image_token_id
-        input_ids[:, -2] = self.vision_start_token_id
+        input_ids[:, 1] = self.image_token_id
+        input_ids[:, 0] = self.vision_start_token_id
         return input_ids
 
     def get_additional_inputs(self, config, input_ids, pixel_values):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 9dbf44c03c12..49691fc21407 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2162,6 +2162,9 @@ def test_resize_tokens_embeddings(self):
 
         # Check that the model can still do a forward pass successfully (every parameter should be resized)
        if not is_deepspeed_zero3_enabled():
+            # Input ids should be expanded to the new maximum size of the vocabulary
+            inputs_dict["input_ids"][:, -2] = new_model_vocab_size - 1
+
             # A distriputed launcher is needed for the forward pass when deepspeed is enabled
             model_inputs = self._prepare_for_class(inputs_dict, model_class)
             model(**model_inputs)
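
For reviewers who want to exercise the new behaviour by hand, here is a minimal sketch (not part of the diff above). The checkpoint id is illustrative and the checks rely only on the accessors this PR adds (`get_per_layer_input_embeddings` and the `resize_token_embeddings` override) plus the attribute updates in `modeling_utils.py`; it assumes a tied-embedding Gemma3n model that fits in memory.

```python
from transformers import Gemma3nForConditionalGeneration

# Illustrative checkpoint id; any Gemma3n checkpoint should behave the same way.
model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma-3n-E2B-it")

new_vocab = model.get_input_embeddings().num_embeddings + 13
model.resize_token_embeddings(new_vocab)

# After this change the main embedding matrix, the (tied) lm_head and the
# per-layer input embeddings should all report the new vocabulary size.
assert model.get_input_embeddings().num_embeddings == new_vocab
assert model.lm_head.out_features == new_vocab
assert model.get_per_layer_input_embeddings().num_embeddings == new_vocab
```

Before this change the per-layer embedding table kept its old shape after a resize, and with tied weights the `num_embeddings` / `out_features` attributes on the reused modules went stale, which is what the early-return branches in `modeling_utils.py` now correct.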