From 2a94397302018fa453af2608c1eddbccc2579e50 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 4 Nov 2024 16:13:17 +0100 Subject: [PATCH 01/12] remove manual assignment tie-word-embeddings --- src/transformers/modeling_utils.py | 9 ++++++--- src/transformers/models/blip_2/configuration_blip_2.py | 1 - .../models/instructblip/configuration_instructblip.py | 1 - .../instructblipvideo/configuration_instructblipvideo.py | 1 - tests/models/blip_2/test_modeling_blip_2.py | 7 +++++++ tests/models/instructblip/test_modeling_instructblip.py | 5 ++++- .../instructblipvideo/test_modeling_instructblipvideo.py | 3 ++- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2ef4c3615c9f..685ac9117e86 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1919,7 +1919,7 @@ def tie_weights(self): If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the weights instead. """ - if getattr(self.config, "tie_word_embeddings", True): + if getattr(self.config.get_text_config(), "tie_word_embeddings", True): output_embeddings = self.get_output_embeddings() if output_embeddings is not None: self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) @@ -2161,7 +2161,7 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean new_num_tokens = new_embeddings.weight.shape[0] # if word embeddings are not tied, make sure that lm head is resized as well - if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: + if self.get_output_embeddings() is not None and not self.config.get_text_config().tie_word_embeddings: old_lm_head = self.get_output_embeddings() if isinstance(old_lm_head, torch.nn.Embedding): new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) @@ -4515,7 +4515,10 @@ def _fix_key(key): _loaded_keys = loaded_keys not_initialized_submodules = set_initialized_submodules(model, _loaded_keys) # If we're about to tie the output embeds to the input embeds we don't need to init them - if hasattr(model.config, "tie_word_embeddings") and model.config.tie_word_embeddings: + if ( + hasattr(model.config.get_text_config(), "tie_word_embeddings") + and model.config.get_text_config().tie_word_embeddings + ): output_embeddings = model.get_output_embeddings() if output_embeddings is not None: # Still need to initialize if there is a bias term since biases are not tied. 
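Note: the three modeling_utils.py hunks above all lean on `PretrainedConfig.get_text_config()`, which resolves to the nested text config on composite configs (and to the config itself on plain text models). A minimal sketch of the invariant being relied on — the config class and values here are illustrative assumptions, not part of the patch:

    from transformers import Blip2Config

    config = Blip2Config()

    # For a composite config, get_text_config() resolves to the nested
    # text config, so the tie flag has a single source of truth ...
    config.text_config.tie_word_embeddings = False
    assert config.get_text_config().tie_word_embeddings is False

    # ... whereas the manual copy removed in the config hunks that follow
    # (self.tie_word_embeddings = self.text_config.tie_word_embeddings)
    # was a snapshot taken at __init__ time and could go stale.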
diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 16fa4aec3849..9d55e396cf79 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -336,7 +336,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.tie_word_embeddings = self.text_config.tie_word_embeddings self.is_encoder_decoder = self.text_config.is_encoder_decoder self.num_query_tokens = num_query_tokens diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index a274212a945e..f9633e3c3aed 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -334,7 +334,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.tie_word_embeddings = self.text_config.tie_word_embeddings self.is_encoder_decoder = self.text_config.is_encoder_decoder self.num_query_tokens = num_query_tokens diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index e7c8eeccef98..f3c1ece52bce 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -339,7 +339,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.tie_word_embeddings = self.text_config.tie_word_embeddings self.is_encoder_decoder = self.text_config.is_encoder_decoder self.num_query_tokens = num_query_tokens diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 1ec9c2e1c07c..7fc6a28e6f87 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -1115,6 +1115,13 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = Blip2ModelTester(self) + common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"] + self.config_tester = ConfigTester( + self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index f06caeb03778..68bd0de82a02 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -158,7 +158,10 @@ class InstructBlipVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = InstructBlipVisionModelTester(self) self.config_tester = ConfigTester( - self, config_class=InstructBlipVisionConfig, has_text_modality=False, hidden_size=37 + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], ) def 
test_config(self): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 7e0bf4eaf0a2..b62e19657dcb 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -163,8 +163,9 @@ class InstructBlipVideoVisionModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = InstructBlipVideoVisionModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] self.config_tester = ConfigTester( - self, config_class=InstructBlipVideoVisionConfig, has_text_modality=False, hidden_size=37 + self, config_class=InstructBlipVideoConfig, has_text_modality=False, common_properties=common_properties ) def test_config(self): From a35663e225a950a8dc58f83f1fad2e072a028e7e Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 4 Nov 2024 16:27:15 +0100 Subject: [PATCH 02/12] remove another unused attribute --- src/transformers/models/blip_2/configuration_blip_2.py | 2 -- .../models/instructblip/configuration_instructblip.py | 2 -- .../models/instructblipvideo/configuration_instructblipvideo.py | 2 -- 3 files changed, 6 deletions(-) diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 9d55e396cf79..38d36ef8362b 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -336,8 +336,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.is_encoder_decoder = self.text_config.is_encoder_decoder - self.num_query_tokens = num_query_tokens self.image_text_hidden_size = image_text_hidden_size self.image_token_index = image_token_index diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index f9633e3c3aed..3bc5602f7a7b 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -334,8 +334,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.is_encoder_decoder = self.text_config.is_encoder_decoder - self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index f3c1ece52bce..6c5e33cce49f 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -339,8 +339,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.is_encoder_decoder = self.text_config.is_encoder_decoder - self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size From da627aab8d2ec6fa32a811b22c8901b0159cf7ed Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 
19 Nov 2024 13:01:34 +0100 Subject: [PATCH 03/12] fix tests --- .../modeling_speech_encoder_decoder.py | 3 +++ .../modeling_vision_encoder_decoder.py | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 0d2b911bebe5..d0d6fa588ec1 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -261,6 +261,9 @@ def get_encoder(self): def get_decoder(self): return self.decoder + def get_input_embeddings(self): + return self.decoder.get_input_embeddings() + def get_output_embeddings(self): return self.decoder.get_output_embeddings() diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 152a96014033..b630eac87352 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -237,6 +237,9 @@ def get_encoder(self): def get_decoder(self): return self.decoder + def get_input_embeddings(self): + return self.decoder.get_input_embeddings() + def get_output_embeddings(self): return self.decoder.get_output_embeddings() @@ -659,12 +662,6 @@ def forward( def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) - def resize_token_embeddings(self, *args, **kwargs): - raise NotImplementedError( - "Resizing the embedding layers via the VisionEncoderDecoderModel directly is not supported.Please use the" - " respective methods of the wrapped decoder object (model.decoder.resize_token_embeddings(...))" - ) - def _reorder_cache(self, past_key_values, beam_idx): # apply decoder cache reordering here return self.decoder._reorder_cache(past_key_values, beam_idx) From 4b85623a60e5667620ad16418a1f0d5ed4b724e9 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 20 Nov 2024 08:45:50 +0100 Subject: [PATCH 04/12] fix tests --- .../models/instructblipvideo/modular_instructblipvideo.py | 3 --- tests/test_modeling_common.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index b0dc8a215740..45edd515b38a 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -137,9 +137,6 @@ def __init__( text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) - self.tie_word_embeddings = self.text_config.tie_word_embeddings - self.is_encoder_decoder = self.text_config.is_encoder_decoder - self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 94b5e175bf88..e54a194b13b7 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2228,7 +2228,7 @@ def test_load_save_without_tied_weights(self): def 
test_tied_weights_keys(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - config.tie_word_embeddings = True + config.get_text_config().tie_word_embeddings = True for model_class in self.all_model_classes: model_tied = model_class(config) From bc028d010a374b14f443ff189b4a4499c65b0f74 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 20 Nov 2024 11:25:11 +0100 Subject: [PATCH 05/12] remove unnecessary overwrites --- .../modeling_new_task_model.py | 3 -- src/transformers/configuration_utils.py | 14 ++++++-- src/transformers/models/clip/modeling_clip.py | 4 +-- src/transformers/models/fuyu/modeling_fuyu.py | 9 ++--- .../grounding_dino/modeling_grounding_dino.py | 4 +-- .../models/idefics2/modeling_idefics2.py | 33 ------------------- .../models/idefics3/modeling_idefics3.py | 11 ------- .../models/llava/modeling_llava.py | 14 +++----- .../models/llava_next/modeling_llava_next.py | 15 ++------- .../modeling_llava_next_video.py | 13 ++------ .../modeling_llava_onevision.py | 7 ++-- .../models/mllama/modeling_mllama.py | 8 ++--- .../models/moshi/modeling_moshi.py | 5 +-- .../models/paligemma/modeling_paligemma.py | 4 --- .../qwen2_audio/modeling_qwen2_audio.py | 15 ++------- .../video_llava/modeling_video_llava.py | 14 ++------ .../models/vipllava/modeling_vipllava.py | 14 +++----- 17 files changed, 45 insertions(+), 142 deletions(-) diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 4556308f1ea0..7d1ff47182b7 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -249,9 +249,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def _update_causal_mask( self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False ): diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 60f9f34cf861..7a9795cb7bb1 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -197,9 +197,17 @@ class PretrainedConfig(PushToHubMixin): _auto_class: Optional[str] = None def __setattr__(self, key, value): - if key in super().__getattribute__("attribute_map"): - key = super().__getattribute__("attribute_map")[key] - super().__setattr__(key, value) + if key in ["tie_word_embeddings", "vocab_size", "is_encoder_decoder", "id_decoder"]: + text_config = self.get_text_config() + if not isinstance(text_config, self.__class__): + # text_config.__setattr__(key, value) + super().__setattr__(key, value) + else: + super().__setattr__(key, value) + else: + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) def __getattribute__(self, key): if key != "attribute_map" and key in super().__getattribute__("attribute_map"): diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 04a3a73de045..09cc09244113 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -1427,7 +1427,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel): def __init__(self, config: CLIPTextConfig): super().__init__(config) - text_model = CLIPTextModel._from_config(config, attn_implementation=config._attn_implementation) + 
text_model = CLIPTextModel._from_config(config) self.text_model = text_model.text_model self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) @@ -1508,7 +1508,7 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel): def __init__(self, config: CLIPVisionConfig): super().__init__(config) - vision_model = CLIPVisionModel._from_config(config, attn_implementation=config._attn_implementation) + vision_model = CLIPVisionModel._from_config(config) self.vision_model = vision_model.vision_model self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index c8c758e6888a..43dfc95589cf 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -151,9 +151,9 @@ def __init__(self, config: FuyuConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.text_config.vocab_size - self.language_model = AutoModelForCausalLM.from_config( - config.text_config, attn_implementation=config._attn_implementation - ) + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] self.vision_embed_tokens = nn.Linear( config.patch_size * config.patch_size * config.num_channels, config.hidden_size @@ -181,9 +181,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def gather_continuous_embeddings( self, word_embeddings: torch.Tensor, diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 9c01ce19f323..c912ad3d1a5d 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2104,9 +2104,7 @@ def __init__(self, config: GroundingDinoConfig): ) # Create text backbone - self.text_backbone = AutoModel.from_config( - config.text_config, add_pooling_layer=False, attn_implementation=config._attn_implementation - ) + self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) if config.embedding_init_target or not config.two_stage: diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 3d46c3bd82e7..d750eabc6ce7 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1286,13 +1286,6 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.text_model.set_input_embeddings(value) - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.text_model.resize_token_embeddings( - new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of - ) - self.config.text_config.vocab_size = model_embeds.num_embeddings - return model_embeds - def inputs_merger( self, input_ids: torch.LongTensor, @@ -1516,32 +1509,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - def 
resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - # model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of) - model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - if new_num_tokens is None and pad_to_multiple_of is None: - return model_embeds - - # Update base model and current model config - # Ignore copy - self.config.text_config.vocab_size = model_embeds.weight.shape[0] - self.vocab_size = self.config.text_config.vocab_size - - # Tie weights again if needed - self.tie_weights() - - return model_embeds - - def tie_weights(self): - """ - Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding. - """ - output_embeddings = self.get_output_embeddings() - input_embeddings = self.get_input_embeddings() - - if getattr(self.config, "tie_word_embeddings", True): - output_embeddings.weight = input_embeddings.weight - @add_start_docstrings_to_model_forward(IDEFICS2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Idefics2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py index 31d43948fbd5..e0bc59c71bef 100644 --- a/src/transformers/models/idefics3/modeling_idefics3.py +++ b/src/transformers/models/idefics3/modeling_idefics3.py @@ -1095,17 +1095,6 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings - # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.tie_weights - def tie_weights(self): - """ - Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding. 
- """ - output_embeddings = self.get_output_embeddings() - input_embeddings = self.get_input_embeddings() - - if getattr(self.config, "tie_word_embeddings", True): - output_embeddings.weight = input_embeddings.weight - @add_start_docstrings_to_model_forward(IDEFICS3_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Idefics3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index e8536ee50f94..bd253a96feae 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -239,6 +239,10 @@ def __init__(self, config: LlavaConfig): self.multi_modal_projector = LlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() @@ -260,16 +264,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def get_image_features( self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str ): diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 269663c7d614..95963295e29a 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -354,6 +354,9 @@ def __init__(self, config: LlavaNextConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() @@ -392,18 +395,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights - def tie_weights(self): - return self.language_model.tie_weights() - - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_image_features( self, 
image_features, diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index b0a20d6c5ccd..0b2a6b7ce1a7 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -394,6 +394,9 @@ def __init__( self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.vision_resampler = LlavaNextVideoPooler(config) @@ -427,16 +430,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_image_features( self, image_features, diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 626db4d96aae..fa518c2b48a0 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -371,6 +371,9 @@ def __init__(self, config: LlavaOnevisionConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.post_init() # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings @@ -397,10 +400,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.tie_weights - def tie_weights(self): - return self.language_model.tie_weights() - def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"): """ Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors. 
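Note: the pattern applied to Llava, LlavaNext, LlavaNextVideo, and LlavaOnevision above (and to the remaining wrappers below) is the same: instead of forwarding `tie_weights()` and `resize_token_embeddings()` to the inner model by hand, the wrapper re-exports the inner model's `_tied_weights_keys` under its own module prefix so the generic `PreTrainedModel` machinery handles both. A condensed sketch with assumed class names:

    class MyWrapperForConditionalGeneration(MyPreTrainedModel):  # illustrative names
        def __init__(self, config):
            super().__init__(config)
            self.language_model = AutoModelForCausalLM.from_config(config.text_config)
            # e.g. ["lm_head.weight"] becomes ["language_model.lm_head.weight"],
            # matching the parameter path as seen from the wrapper's state_dict
            if self.language_model._tied_weights_keys is not None:
                self._tied_weights_keys = [
                    f"language_model.{k}" for k in self.language_model._tied_weights_keys
                ]
            self.post_init()

        def get_input_embeddings(self):
            return self.language_model.get_input_embeddings()

        def get_output_embeddings(self):
            return self.language_model.get_output_embeddings()

    # With the embedding accessors wired through and _tied_weights_keys
    # prefixed, the base-class tie_weights() and resize_token_embeddings()
    # cover the wrapped decoder, so the per-model overrides can be deleted.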
diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 8ce6150a2fa2..424f8e23ad80 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1845,7 +1845,7 @@ def __init__(self, config): super().__init__(config.get_text_config()) self.text_config = config.get_text_config() self.vocab_size = self.text_config.vocab_size - self.model = MllamaTextModel._from_config(self.text_config, attn_implementation=config._attn_implementation) + self.model = MllamaTextModel._from_config(self.text_config) self.lm_head = nn.Linear(self.text_config.hidden_size, self.vocab_size, bias=False) self.post_init() @@ -1982,6 +1982,9 @@ def __init__(self, config: MllamaConfig): self.vision_model = MllamaVisionModel._from_config(config.vision_config) self.language_model = MllamaForCausalLM._from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.multi_modal_projector = nn.Linear( config.vision_config.vision_output_dim, config.text_config.hidden_size, @@ -2007,9 +2010,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - @add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class="MllamaConfig") def forward( diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 9975996d21d1..ad21c9a180f9 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1866,12 +1866,9 @@ def __init__(self, config: MoshiConfig): self.embed_tokens = nn.ModuleList( [nn.Embedding(config.audio_vocab_size + 1, config.hidden_size) for _ in range(2 * config.num_codebooks)] ) - self.audio_encoder = AutoModel.from_config( - config.audio_encoder_config, attn_implementation=config._attn_implementation - ) + self.audio_encoder = AutoModel.from_config(config.audio_encoder_config) self.decoder = MoshiForCausalLM(config) - config.depth_decoder_config._attn_implementation_internal = config._attn_implementation self.depth_decoder = MoshiDepthDecoder(config.depth_decoder_config) self.num_codebooks = config.num_codebooks diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index e198dab420ab..528ee66e1166 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -336,10 +336,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights with Llava->PaliGemma - def tie_weights(self): - return self.language_model.tie_weights() - def _update_causal_mask( self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False ): diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index ce0e427048cf..6f6a9f69af3e 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -857,6 +857,9 @@ def __init__(self, 
config: Qwen2AudioConfig): self.multi_modal_projector = Qwen2AudioMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides self.post_init() @@ -895,18 +898,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights - def tie_weights(self): - return self.language_model.tie_weights() - - # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_audio_features( self, audio_features, num_audio_tokens, inputs_embeds, input_ids, attention_mask, labels ): diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 30adcb6ab5c0..6fa724f743e3 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -242,6 +242,9 @@ def __init__(self, config: VideoLlavaConfig): self.multi_modal_projector = VideoLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() @@ -263,17 +266,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - def _merge_input_ids_with_visual_features( self, visual_features, inputs_embeds, input_ids, attention_mask, labels, num_frames=1 ): diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index b45325d2194e..1d3b3cd46509 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -242,6 +242,10 @@ def __init__(self, config: VipLlavaConfig): self.multi_modal_projector = VipLlavaMultiModalProjector(config) self.vocab_size = config.text_config.vocab_size self.language_model = 
AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self.post_init() @@ -263,16 +267,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: - model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) - # update vocab size - self.config.text_config.vocab_size = model_embeds.num_embeddings - self.vocab_size = model_embeds.num_embeddings - return model_embeds - # Ignore copy def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: List[int]): """ From 4fc9f36d63f4ce1748e64f62d7d464eca90699ae Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 20 Nov 2024 11:49:36 +0100 Subject: [PATCH 06/12] fix --- src/transformers/configuration_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 7a9795cb7bb1..60f9f34cf861 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -197,17 +197,9 @@ class PretrainedConfig(PushToHubMixin): _auto_class: Optional[str] = None def __setattr__(self, key, value): - if key in ["tie_word_embeddings", "vocab_size", "is_encoder_decoder", "id_decoder"]: - text_config = self.get_text_config() - if not isinstance(text_config, self.__class__): - # text_config.__setattr__(key, value) - super().__setattr__(key, value) - else: - super().__setattr__(key, value) - else: - if key in super().__getattribute__("attribute_map"): - key = super().__getattribute__("attribute_map")[key] - super().__setattr__(key, value) + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) def __getattribute__(self, key): if key != "attribute_map" and key in super().__getattribute__("attribute_map"): From fbc343868c0c52d0ed797675a468722371fdda51 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 20 Nov 2024 12:52:31 +0100 Subject: [PATCH 07/12] decoder=True --- src/transformers/modeling_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7849dc52ca91..fd3e6b6c872f 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1843,7 +1843,7 @@ def tie_weights(self): If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the weights instead. 
""" - if getattr(self.config.get_text_config(), "tie_word_embeddings", True): + if getattr(self.config.get_text_config(decoder=True), "tie_word_embeddings", True): output_embeddings = self.get_output_embeddings() if output_embeddings is not None: self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) @@ -2085,7 +2085,10 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean new_num_tokens = new_embeddings.weight.shape[0] # if word embeddings are not tied, make sure that lm head is resized as well - if self.get_output_embeddings() is not None and not self.config.get_text_config().tie_word_embeddings: + if ( + self.get_output_embeddings() is not None + and not self.config.get_text_config(decoder=True).tie_word_embeddings + ): old_lm_head = self.get_output_embeddings() if isinstance(old_lm_head, torch.nn.Embedding): new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) @@ -4443,8 +4446,8 @@ def _fix_key(key): not_initialized_submodules = set_initialized_submodules(model, _loaded_keys) # If we're about to tie the output embeds to the input embeds we don't need to init them if ( - hasattr(model.config.get_text_config(), "tie_word_embeddings") - and model.config.get_text_config().tie_word_embeddings + hasattr(model.config.get_text_config(decoder=True), "tie_word_embeddings") + and model.config.get_text_config(decoder=True).tie_word_embeddings ): output_embeddings = model.get_output_embeddings() if output_embeddings is not None: From 392026beec666e227c80ace183f6cdeec3919606 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 29 Nov 2024 14:39:12 +0100 Subject: [PATCH 08/12] clean pix2struct --- .../pix2struct/configuration_pix2struct.py | 43 ------------------- .../models/pix2struct/modeling_pix2struct.py | 8 ---- 2 files changed, 51 deletions(-) diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index d74bb84ce6ab..44b5e34884ee 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -14,9 +14,6 @@ # limitations under the License. """Pix2Struct model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -143,26 +140,6 @@ def __init__( **kwargs, ) - @classmethod - def from_pretrained( - cls, pretrainehidden_size_name_or_path: Union[str, os.PathLike], **kwargs - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrainehidden_size_name_or_path, **kwargs) - - # get the text config dict if we are loading from Pix2StructConfig - if config_dict.get("model_type") == "pix2struct": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Pix2StructVisionConfig(PretrainedConfig): r""" @@ -262,26 +239,6 @@ def __init__( self.relative_attention_max_distance = relative_attention_max_distance self.d_kv = d_kv - @classmethod - def from_pretrained( - cls, pretrainehidden_size_name_or_path: Union[str, os.PathLike], **kwargs - ) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrainehidden_size_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Pix2StructConfig - if config_dict.get("model_type") == "pix2struct": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Pix2StructConfig(PretrainedConfig): r""" diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 176dadd5b883..c6bf5486cec4 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1733,14 +1733,6 @@ def get_output_embeddings(self) -> nn.Module: def set_output_embeddings(self, new_embeddings): self.decoder.set_output_embeddings(new_embeddings) - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: - model_embeds = self.decoder.resize_token_embeddings(new_num_tokens) - - # update vocab size - self.config.text_config.vocab_size = new_num_tokens - - return model_embeds - def get_decoder(self): return self.decoder From 03c26903b412e5426d3b0681281c0a707ecd09d8 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 13 Jan 2025 13:29:53 +0100 Subject: [PATCH 09/12] run-all From e9f44ea0e7e284ad594613ea21333894b3704b01 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 13 Jan 2025 15:14:58 +0100 Subject: [PATCH 10/12] forgot `_tied_weights_keys` when adding Emu3 --- src/transformers/models/emu3/modeling_emu3.py | 3 +++ src/transformers/models/emu3/modular_emu3.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 1ee883aa406d..3d73f89dbcac 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1788,9 +1788,12 @@ def forward( class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["text_model.lm_head.weight"] + def __init__(self, config): super().__init__(config) self.text_model = Emu3ForCausalLM._from_config(config.text_config) + self.vqmodel = Emu3VQVAE(config.vq_config) self.vocabulary_mapping = Emu3ImageVocabularyMapping(config.vocabulary_map) diff --git a/src/transformers/models/emu3/modular_emu3.py b/src/transformers/models/emu3/modular_emu3.py index e9b80d5cbb4d..da6016dc266b 100644 --- a/src/transformers/models/emu3/modular_emu3.py +++ b/src/transformers/models/emu3/modular_emu3.py @@ -1103,6 +1103,8 @@ def forward(**super_kwargs): class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["text_model.lm_head.weight"] + def __init__(self, config): super().__init__(config) self.text_model = 
Emu3ForCausalLM._from_config(config.text_config) From b7e43f0ca958a9c1e97f6dee246643c44d461bec Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 13 Jan 2025 16:22:58 +0100 Subject: [PATCH 11/12] also Aria + fix-copies --- src/transformers/models/aria/modeling_aria.py | 1 + src/transformers/models/aria/modular_aria.py | 1 + src/transformers/models/emu3/modeling_emu3.py | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index 12d7224b2105..dbbfde6e3065 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -1357,6 +1357,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): config_class = AriaConfig _supports_flash_attn_2 = False _supports_sdpa = False + _tied_weights_keys = ["language_model.lm_head.weight"] def __init__(self, config: AriaConfig): super().__init__(config) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 78c6e08bdfd0..d6829b8d9b07 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1337,6 +1337,7 @@ class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): config_class = AriaConfig _supports_flash_attn_2 = False _supports_sdpa = False + _tied_weights_keys = ["language_model.lm_head.weight"] def __init__(self, config: AriaConfig): super().__init__(config) diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 3d73f89dbcac..8983fc3b0fad 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -1793,7 +1793,6 @@ class Emu3ForConditionalGeneration(Emu3PreTrainedModel, GenerationMixin): def __init__(self, config): super().__init__(config) self.text_model = Emu3ForCausalLM._from_config(config.text_config) - self.vqmodel = Emu3VQVAE(config.vq_config) self.vocabulary_mapping = Emu3ImageVocabularyMapping(config.vocabulary_map) From 0ba9f37d9be26877de0a5cbd598f6a5ec7c02fd5 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 15 Jan 2025 09:15:17 +0100 Subject: [PATCH 12/12] and clean aria --- src/transformers/models/aria/modeling_aria.py | 3 --- src/transformers/models/aria/modular_aria.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index dbbfde6e3065..048d8de1ce35 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -1404,9 +1404,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def get_image_features( self, pixel_values: torch.FloatTensor, diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index d6829b8d9b07..295e2dcb7465 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -1384,9 +1384,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def get_image_features( self, pixel_values: torch.FloatTensor,
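Note: taken together, the series leaves embedding resizing and weight tying entirely to the shared base-class path, which now reads the flag via `config.get_text_config(decoder=True)`. A usage sketch — the checkpoint id and token count are illustrative assumptions:

    from transformers import LlavaForConditionalGeneration

    model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")

    # No model-specific override is involved anymore: this runs
    # PreTrainedModel.resize_token_embeddings, which also resizes the lm_head
    # when config.get_text_config(decoder=True).tie_word_embeddings is False,
    # and re-ties the weights when it is True.
    embeddings = model.resize_token_embeddings(new_num_tokens=32_128)
    assert model.get_input_embeddings().num_embeddings == 32_128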