From 069cc2e598f5f3cc853df7c6c4f8a8983942e771 Mon Sep 17 00:00:00 2001 From: Yung-Sung Chuang Date: Sat, 30 Aug 2025 15:07:41 -0400 Subject: [PATCH 1/5] fix MetaCLIP 2 wrong link & wrong model names in the documentation and docstrings --- docs/source/en/model_doc/metaclip_2.md | 2 +- .../models/metaclip_2/modular_metaclip_2.py | 571 +++++++++++++++++- 2 files changed, 570 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/metaclip_2.md b/docs/source/en/model_doc/metaclip_2.md index 62d14fa66e18..b9fbba090f0a 100644 --- a/docs/source/en/model_doc/metaclip_2.md +++ b/docs/source/en/model_doc/metaclip_2.md @@ -32,7 +32,7 @@ MetaCLIP 2 is a replication of the original CLIP model trained on 300+ languages This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/MetaCLIP). -You can find all the MetaCLIP 2 checkpoints under the [Meta](https://huggingface.co/facebook?search_models=metaclip-2) organization. +You can find all the MetaCLIP 2 checkpoints under the [Meta](https://huggingface.co/facebook/models?search=metaclip-2) organization. > [!TIP] > Click on the MetaCLIP 2 models in the right sidebar for more examples of how to apply MetaCLIP 2 to different image and language tasks. diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index d4c259849e69..65dc0307ac9b 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -28,15 +28,175 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "facebook/metaclip-2-worldwide-huge-quickgelu" +_CONFIG_FOR_DOC = "MetaClip2Config" + + class MetaClip2TextConfig(CLIPTextConfig): + r""" + This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate + a MetaClip2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the MetaClip2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`MetaClip2TextModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 49406): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 49407): + End of stream token id. + + Example: + + ```python + >>> from transformers import MetaClip2TextConfig, MetaClip2TextModel + + >>> # Initializing a MetaClip2TextConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2TextConfig() + + >>> # Initializing a MetaClip2TextModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2TextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" pass class MetaClip2VisionConfig(CLIPVisionConfig): + r""" + This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a MetaClip2 + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. 
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import MetaClip2VisionConfig, MetaClip2VisionModel + + >>> # Initializing a MetaClip2VisionConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2VisionConfig() + + >>> # Initializing a MetaClip2VisionModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" pass class MetaClip2Config(CLIPConfig): + r""" + [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. It is used to + instantiate a MetaClip2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MetaClip2TextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MetaClip2VisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original MetaClip2 implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import MetaClip2Config, MetaClip2Model + + >>> # Initializing a MetaClip2Config with facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> configuration = MetaClip2Config() + + >>> # Initializing a MetaClip2Model (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration + >>> model = MetaClip2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a MetaClip2Config from a MetaClip2TextConfig and a MetaClip2VisionConfig + >>> from transformers import MetaClip2TextConfig, MetaClip2VisionConfig + + >>> # Initializing a MetaClip2Text and MetaClip2Vision configuration + >>> config_text = MetaClip2TextConfig() + >>> config_vision = MetaClip2VisionConfig() + + >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) + ```""" pass @@ -175,14 +335,105 @@ def forward( class MetaClip2TextModel(CLIPTextModel): + """ + The text model from MetaClip2 without any head or projection on top. + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + def __init__(self, config: MetaClip2TextConfig): super().__init__(config) self.text_model = MetaClip2TextTransformer(config) # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection): + """ + MetaClip2 text model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + def __init__(self, config: MetaClip2TextConfig): super().__init__(config) @@ -194,8 +445,74 @@ def __init__(self, config: MetaClip2TextConfig): # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + class MetaClip2Model(CLIPModel): + """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + def __init__(self, config: MetaClip2Config): super().__init__(config) @@ -219,13 +536,263 @@ def __init__(self, config: MetaClip2Config): # Initialize weights and apply final processing self.post_init() + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Args: + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + return super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`MetaClip2TextModel`]. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + return super().get_text_features( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`MetaClip2VisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return super().get_image_features( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + class MetaClip2VisionModel(CLIPVisionModel): - pass + """ + The vision model from MetaClip2 without any head or projection on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return super().forward( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection): - pass + """ + MetaClip2 vision model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + return super().forward( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + ) class MetaClip2ForImageClassification(CLIPForImageClassification): From 6abdf780df470fc8d3d798447280103d7c04cff9 Mon Sep 17 00:00:00 2001 From: Yung-Sung Chuang Date: Sat, 30 Aug 2025 15:35:36 -0400 Subject: [PATCH 2/5] ruff reformatted --- src/transformers/models/metaclip_2/modular_metaclip_2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index 65dc0307ac9b..fb80b06c9ac9 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -92,6 +92,7 @@ class MetaClip2TextConfig(CLIPTextConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + pass @@ -149,6 +150,7 @@ class MetaClip2VisionConfig(CLIPVisionConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" + pass @@ -197,6 +199,7 @@ class MetaClip2Config(CLIPConfig): >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) ```""" + pass From 356e2276f5d38185eb6a473e541e19efbdaca4f1 Mon Sep 17 00:00:00 2001 From: Yung-Sung Chuang Date: Sat, 30 Aug 2025 20:25:51 -0400 Subject: [PATCH 3/5] update files generated by modular --- .../metaclip_2/configuration_metaclip_2.py | 50 ++-- .../models/metaclip_2/modeling_metaclip_2.py | 224 +++++++++++++++--- 2 files changed, 220 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index 32b21a193e8d..070101569384 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -14,18 +14,18 @@ class MetaClip2TextConfig(PretrainedConfig): r""" 
- This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate a METACLIP_2 - text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the text encoder of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. + This is the configuration class to store the configuration of a [`MetaClip2TextModel`]. It is used to instantiate + a MetaClip2 text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 49408): - Vocabulary size of the METACLIP_2 text model. Defines the number of different tokens that can be represented by - the `inputs_ids` passed when calling [`MetaClip2Model`]. + Vocabulary size of the MetaClip2 text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`MetaClip2TextModel`]. hidden_size (`int`, *optional*, defaults to 512): Dimensionality of the encoder layers and the pooler layer. intermediate_size (`int`, *optional*, defaults to 2048): @@ -63,17 +63,17 @@ class MetaClip2TextConfig(PretrainedConfig): ```python >>> from transformers import MetaClip2TextConfig, MetaClip2TextModel - >>> # Initializing a MetaClip2TextConfig with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2TextConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2TextConfig() - >>> # Initializing a MetaClip2TextModel (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2TextModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2TextModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "metaclip_2_text_model" + model_type = "meta_clip2_text_model" base_config_key = "text_config" def __init__( @@ -90,7 +90,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - # This differs from `MetaClip2Tokenizer`'s default and from openai/metaclip_2 + # This differs from `MetaClip2Tokenizer`'s default and from openai/meta_clip2 # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 pad_token_id=1, bos_token_id=49406, @@ -115,10 +115,10 @@ def __init__( class MetaClip2VisionConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a - METACLIP_2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a - configuration with the defaults will yield a similar configuration to that of the vision encoder of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. 
+ This is the configuration class to store the configuration of a [`MetaClip2VisionModel`]. It is used to instantiate a MetaClip2 + vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the vision encoder of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -158,17 +158,17 @@ class MetaClip2VisionConfig(PretrainedConfig): ```python >>> from transformers import MetaClip2VisionConfig, MetaClip2VisionModel - >>> # Initializing a MetaClip2VisionConfig with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2VisionConfig with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2VisionConfig() - >>> # Initializing a MetaClip2VisionModel (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2VisionModel (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2VisionModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "metaclip_2_vision_model" + model_type = "meta_clip2_vision_model" base_config_key = "vision_config" def __init__( @@ -207,10 +207,10 @@ def __init__( class MetaClip2Config(PretrainedConfig): r""" - [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. It is used to instantiate - a METACLIP_2 model according to the specified arguments, defining the text model and vision model configs. Instantiating - a configuration with the defaults will yield a similar configuration to that of the METACLIP_2 - [openai/metaclip_2-vit-base-patch32](https://huggingface.co/openai/metaclip_2-vit-base-patch32) architecture. + [`MetaClip2Config`] is the configuration class to store the configuration of a [`MetaClip2Model`]. It is used to + instantiate a MetaClip2 model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the MetaClip2 + [facebook/metaclip-2-worldwide-huge-quickgelu](https://huggingface.co/facebook/metaclip-2-worldwide-huge-quickgelu) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. @@ -223,7 +223,7 @@ class MetaClip2Config(PretrainedConfig): projection_dim (`int`, *optional*, defaults to 512): Dimensionality of text and vision projection layers. logit_scale_init_value (`float`, *optional*, defaults to 2.6592): - The initial value of the *logit_scale* parameter. Default is used as per the original METACLIP_2 implementation. + The initial value of the *logit_scale* parameter. Default is used as per the original MetaClip2 implementation. kwargs (*optional*): Dictionary of keyword arguments. 
@@ -232,10 +232,10 @@ class MetaClip2Config(PretrainedConfig): ```python >>> from transformers import MetaClip2Config, MetaClip2Model - >>> # Initializing a MetaClip2Config with openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2Config with facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> configuration = MetaClip2Config() - >>> # Initializing a MetaClip2Model (with random weights) from the openai/metaclip_2-vit-base-patch32 style configuration + >>> # Initializing a MetaClip2Model (with random weights) from the facebook/metaclip-2-worldwide-huge-quickgelu style configuration >>> model = MetaClip2Model(configuration) >>> # Accessing the model configuration @@ -251,7 +251,7 @@ class MetaClip2Config(PretrainedConfig): >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) ```""" - model_type = "metaclip_2" + model_type = "meta_clip2" sub_configs = {"text_config": MetaClip2TextConfig, "vision_config": MetaClip2VisionConfig} def __init__( diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index 56ab43c03010..4ba46c10d873 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -213,7 +213,7 @@ def forward( queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) - # METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask` + # META_CLIP2 text model uses both `causal_attention_mask` and `attention_mask` # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` if self.config._attn_implementation == "flash_attention_2": self.is_causal = causal_attention_mask is not None @@ -538,14 +538,43 @@ def forward( @auto_docstring( custom_intro=""" - The text model from METACLIP_2 without any head or projection on top. + The text model from META_CLIP2 without any head or projection on top. """ ) class MetaClip2TextModel(MetaClip2PreTrainedModel): + """ + The text model from MetaClip2 without any head or projection on top. + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModel + + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + config: MetaClip2TextConfig _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2TextConfig): super().__init__(config) @@ -575,8 +604,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, MetaClip2TextModel - >>> model = MetaClip2TextModel.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2TextModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -614,6 +643,36 @@ class MetaClip2TextModelOutput(ModelOutput): @auto_docstring class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): + """ + MetaClip2 text model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2TextConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection + + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + config: MetaClip2TextConfig _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] @@ -651,8 +710,8 @@ def forward( ```python >>> from transformers import AutoTokenizer, MetaClip2TextModelWithProjection - >>> model = MetaClip2TextModelWithProjection.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2TextModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") @@ -716,12 +775,12 @@ def to_tuple(self) -> tuple[Any]: # contrastive loss function, adapted from -# https://sachinruk.github.io/blog/2021-03-07-metaclip_2.html +# https://sachinruk.github.io/blog/2021-03-07-meta_clip2.html def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) -def metaclip_2_loss(similarity: torch.Tensor) -> torch.Tensor: +def meta_clip2_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @@ -740,9 +799,44 @@ def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor: @auto_docstring class MetaClip2Model(MetaClip2PreTrainedModel): + """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2Model + + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... 
) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + config: MetaClip2Config _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"] - _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2Config): super().__init__(config) @@ -798,13 +892,13 @@ def get_text_features( ```python >>> from transformers import AutoTokenizer, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> tokenizer = AutoTokenizer.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. + # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -843,8 +937,8 @@ def get_image_features( >>> import requests >>> from transformers import AutoProcessor, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -853,7 +947,7 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. + # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -885,8 +979,9 @@ def forward( interpolate_pos_encoding: bool = False, ) -> MetaClip2Output: r""" - return_loss (`bool`, *optional*): - Whether or not to return the contrastive loss. + Args: + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. 
Examples: @@ -895,8 +990,8 @@ def forward( >>> import requests >>> from transformers import AutoProcessor, MetaClip2Model - >>> model = MetaClip2Model.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2Model.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -909,7 +1004,7 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. + # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -948,7 +1043,7 @@ def forward( loss = None if return_loss: - loss = metaclip_2_loss(logits_per_text) + loss = meta_clip2_loss(logits_per_text) return MetaClip2Output( loss=loss, @@ -1011,10 +1106,46 @@ def forward( @auto_docstring( custom_intro=""" - The vision model from METACLIP_2 without any head or projection on top. + The vision model from META_CLIP2 without any head or projection on top. """ ) class MetaClip2VisionModel(MetaClip2PreTrainedModel): + """ + The vision model from MetaClip2 without any head or projection on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModel + + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + config: MetaClip2VisionConfig main_input_name = "pixel_values" _no_split_modules = ["MetaClip2EncoderLayer"] @@ -1038,15 +1169,15 @@ def forward( interpolate_pos_encoding: bool = False, ) -> BaseModelOutputWithPooling: r""" - Example: + Examples: ```python >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, MetaClip2VisionModel - >>> model = MetaClip2VisionModel.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2VisionModel.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1086,6 +1217,41 @@ class MetaClip2VisionModelOutput(ModelOutput): @auto_docstring class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel): + """ + MetaClip2 vision model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`MetaClip2VisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection + + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + config: MetaClip2VisionConfig main_input_name = "pixel_values" @@ -1120,8 +1286,8 @@ def forward( >>> import requests >>> from transformers import AutoProcessor, MetaClip2VisionModelWithProjection - >>> model = MetaClip2VisionModelWithProjection.from_pretrained("openai/metaclip_2-vit-base-patch32") - >>> processor = AutoProcessor.from_pretrained("openai/metaclip_2-vit-base-patch32") + >>> model = MetaClip2VisionModelWithProjection.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") + >>> processor = AutoProcessor.from_pretrained("facebook/metaclip-2-worldwide-huge-quickgelu") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) @@ -1151,7 +1317,7 @@ def forward( @auto_docstring( custom_intro=""" - METACLIP_2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of + META_CLIP2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of the patch tokens) e.g. for ImageNet. """ ) From 4c359ccf517683868dd01e6af9e112fe4308a63c Mon Sep 17 00:00:00 2001 From: Yung-Sung Chuang Date: Sat, 30 Aug 2025 20:45:50 -0400 Subject: [PATCH 4/5] update meta_clip2 to metaclip_2 to match the original --- .../metaclip_2/configuration_metaclip_2.py | 8 ++++---- .../models/metaclip_2/modeling_metaclip_2.py | 20 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index 070101569384..a0cec0f3c5b3 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -73,7 +73,7 @@ class MetaClip2TextConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "meta_clip2_text_model" + model_type = "metaclip_2_text_model" base_config_key = "text_config" def __init__( @@ -90,7 +90,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - # This differs from `MetaClip2Tokenizer`'s default and from openai/meta_clip2 + # This differs from `MetaClip2Tokenizer`'s default and from openai/metaclip_2 # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 pad_token_id=1, bos_token_id=49406, @@ -168,7 +168,7 @@ class MetaClip2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "meta_clip2_vision_model" + model_type = "metaclip_2_vision_model" base_config_key = "vision_config" def __init__( @@ -251,7 +251,7 @@ class MetaClip2Config(PretrainedConfig): >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) ```""" - model_type = "meta_clip2" + model_type = "metaclip_2" sub_configs = {"text_config": MetaClip2TextConfig, 
"vision_config": MetaClip2VisionConfig} def __init__( diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index 4ba46c10d873..bfe784572823 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -213,7 +213,7 @@ def forward( queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) - # META_CLIP2 text model uses both `causal_attention_mask` and `attention_mask` + # METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask` # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` if self.config._attn_implementation == "flash_attention_2": self.is_causal = causal_attention_mask is not None @@ -538,7 +538,7 @@ def forward( @auto_docstring( custom_intro=""" - The text model from META_CLIP2 without any head or projection on top. + The text model from METACLIP_2 without any head or projection on top. """ ) class MetaClip2TextModel(MetaClip2PreTrainedModel): @@ -775,12 +775,12 @@ def to_tuple(self) -> tuple[Any]: # contrastive loss function, adapted from -# https://sachinruk.github.io/blog/2021-03-07-meta_clip2.html +# https://sachinruk.github.io/blog/2021-03-07-metaclip_2.html def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) -def meta_clip2_loss(similarity: torch.Tensor) -> torch.Tensor: +def metaclip_2_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @@ -898,7 +898,7 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. + # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -947,7 +947,7 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. + # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1004,7 +1004,7 @@ def forward( >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" - # Use META_CLIP2 model's config for some fields (if specified) instead of those of vision & text components. + # Use METACLIP_2 model's config for some fields (if specified) instead of those of vision & text components. 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1043,7 +1043,7 @@ def forward( loss = None if return_loss: - loss = meta_clip2_loss(logits_per_text) + loss = metaclip_2_loss(logits_per_text) return MetaClip2Output( loss=loss, @@ -1106,7 +1106,7 @@ def forward( @auto_docstring( custom_intro=""" - The vision model from META_CLIP2 without any head or projection on top. + The vision model from METACLIP_2 without any head or projection on top. """ ) class MetaClip2VisionModel(MetaClip2PreTrainedModel): @@ -1317,7 +1317,7 @@ def forward( @auto_docstring( custom_intro=""" - META_CLIP2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of + METACLIP_2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of the patch tokens) e.g. for ImageNet. """ ) From 97730a3f5f9d63b0053e36159d5777a36f3d0cfa Mon Sep 17 00:00:00 2001 From: Yung-Sung Chuang Date: Sat, 30 Aug 2025 20:57:12 -0400 Subject: [PATCH 5/5] _supports_flash_attn = False --- src/transformers/models/metaclip_2/modeling_metaclip_2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/metaclip_2/modeling_metaclip_2.py b/src/transformers/models/metaclip_2/modeling_metaclip_2.py index bfe784572823..e326cc954987 100644 --- a/src/transformers/models/metaclip_2/modeling_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modeling_metaclip_2.py @@ -575,6 +575,7 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel): config: MetaClip2TextConfig _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] + _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2TextConfig): super().__init__(config) @@ -837,6 +838,7 @@ class MetaClip2Model(MetaClip2PreTrainedModel): config: MetaClip2Config _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"] + _supports_flash_attn = False # mask creation only accounts for sdpa/eager def __init__(self, config: MetaClip2Config): super().__init__(config)
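---

Since the main point of this series is swapping the stale `openai/metaclip_2-vit-base-patch32` references for `facebook/metaclip-2-worldwide-huge-quickgelu`, a quick sanity check is to run the exact flow the updated docstrings describe against that checkpoint. The sketch below is not part of the patch; it assumes the `facebook/metaclip-2-worldwide-huge-quickgelu` repository is available on the Hub and that `MetaClip2Model` is importable from `transformers` top level, as the patched docstrings themselves show.

```python
# Sanity-check sketch for the renamed checkpoint references in this patch
# (assumed checkpoint name taken from the docstrings above, not verified here).
import requests
from PIL import Image
from transformers import AutoProcessor, MetaClip2Model

checkpoint = "facebook/metaclip-2-worldwide-huge-quickgelu"
model = MetaClip2Model.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
)
outputs = model(**inputs)

# logits_per_image holds the image-text similarity scores; softmax over the
# text candidates turns them into zero-shot label probabilities.
probs = outputs.logits_per_image.softmax(dim=1)
print(probs)
```

If the docstring examples and this snippet both resolve the checkpoint, the renamed references are consistent; note that PATCH 4 deliberately keeps the `model_type` strings as `metaclip_2*`, so configs already published under that identifier should continue to map to the right classes.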