From 62174bd0127a3df60de0fcd073e5d6248247c8ad Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Mon, 4 Dec 2023 21:45:07 +0000
Subject: [PATCH 01/23] Add support for siglip models

---
 docs/source/exporters/onnx/overview.mdx |  1 +
 optimum/exporters/onnx/model_configs.py | 19 +++++++++++++++++++
 optimum/exporters/tasks.py              | 13 +++++++++++++
 optimum/utils/normalized_config.py      |  1 +
 tests/exporters/exporters_utils.py      |  1 +
 transformers                            |  1 +
 6 files changed, 36 insertions(+)
 create mode 160000 transformers

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 0ea17b6afe..2d84e376c6 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -83,6 +83,7 @@ Supported architectures:
 - SEW
 - SEW-D
 - Speech2Text
+- SigLIP
 - SpeechT5
 - Splinter
 - SqueezeBert
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index f4d50ad58d..d7c019f9d3 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -806,6 +806,7 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
+        print('1 get inputs')
         return {
             "input_ids": {0: "text_batch_size", 1: "sequence_length"},
             "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"},
@@ -836,8 +837,10 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
+        print('2 get inputs')
         return {
             "input_ids": {0: "batch_size", 1: "sequence_length"},
+            "attention_mask": {0: "text_batch_size", 1: "sequence_length"},
         }
 
     @property
@@ -876,6 +879,22 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
         return dummy_inputs
 
 
+class SiglipNormalizedConfig(CLIPNormalizedConfig):
+    pass
+
+
+class SiglipOnnxConfig(CLIPOnnxConfig):
+    pass
+
+
+class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig):
+    pass
+
+
+class SiglipTextOnnxConfig(CLIPTextOnnxConfig):
+    pass
+
+
 class UNetOnnxConfig(VisionOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-3
     # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 7545c72d6c..e7ef41d99b 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -864,6 +864,19 @@ class TasksManager:
             "audio-classification",
             onnx="SEWDOnnxConfig",
         ),
+        "siglip": supported_tasks_mapping(
+            "feature-extraction",
+            "zero-shot-image-classification",
+            onnx="SiglipOnnxConfig",
+        ),
+        "siglip-text-model": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="SiglipTextOnnxConfig",
+        ),
+        "siglip-text-with-projection": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="SiglipTextWithProjectionOnnxConfig",
+        ),
         "speech-to-text": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
index 7a0af9a1a4..e98586e5bc 100644
--- a/optimum/utils/normalized_config.py
+++ b/optimum/utils/normalized_config.py
@@ -201,6 +201,7 @@ class NormalizedConfigManager:
         'perceiver',
         'roformer',
         'segformer',
+        'siglip',
         'squeezebert',
     """
 
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 6e43b65e34..abd570a22a 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -129,6 +129,7 @@
     "roformer": "hf-internal-testing/tiny-random-RoFormerModel",
     # "sam": "fxmarty/sam-vit-tiny-random",  # TODO: re-enable once PyTorch 2.1 is released, see https://github.com/huggingface/optimum/pull/1301
     "segformer": "hf-internal-testing/tiny-random-SegformerModel",
+    "siglip": "HuggingFaceM4/tiny-random-siglip",
     "splinter": "hf-internal-testing/tiny-random-SplinterModel",
     "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel",
     "swin": "hf-internal-testing/tiny-random-SwinModel",
diff --git a/transformers b/transformers
new file mode 160000
index 0000000000..e2e6dc9a6d
--- /dev/null
+++ b/transformers
@@ -0,0 +1 @@
+Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14

From 3860ac39ea7468cb97be2e76d46a0ad4ab53a3ec Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Mon, 4 Dec 2023 21:48:51 +0000
Subject: [PATCH 02/23] Remove debug prints and stray attention_mask input

---
 optimum/exporters/onnx/model_configs.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index d7c019f9d3..66596a1bf7 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -806,7 +806,6 @@ class CLIPOnnxConfig(TextAndVisionOnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
-        print('1 get inputs')
         return {
             "input_ids": {0: "text_batch_size", 1: "sequence_length"},
             "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"},
@@ -837,10 +836,8 @@ class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig):
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:
-        print('2 get inputs')
         return {
             "input_ids": {0: "batch_size", 1: "sequence_length"},
-            "attention_mask": {0: "text_batch_size", 1: "sequence_length"},
         }
 
     @property

From 3e235380803ccc223e688b4e0ef7a1685b46554f Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Mon, 4 Dec 2023 21:50:19 +0000
Subject: [PATCH 03/23] Remove transformers submodule

---
 transformers | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 transformers

diff --git a/transformers b/transformers
deleted file mode 160000
index e2e6dc9a6d..0000000000
--- a/transformers
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit e2e6dc9a6dfb4665e41a084c45f4e5a34ea32a14

From be9c70790036f6d7111abbe1ed383e232c667689 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Sat, 9 Dec 2023 17:03:08 +0000
Subject: [PATCH 04/23] Add ONNX export for DINOv2 models

---
 docs/source/exporters/onnx/overview.mdx      | 1 +
 optimum/exporters/onnx/model_configs.py      | 4 ++++
 optimum/exporters/tasks.py                   | 5 +++++
 optimum/onnxruntime/modeling_ort.py          | 2 +-
 optimum/utils/normalized_config.py           | 1 +
 tests/exporters/exporters_utils.py           | 1 +
 tests/onnxruntime/test_modeling.py           | 1 +
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 8 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 0a5da755a3..f6bd06cde1 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -38,6 +38,7 @@ Supported architectures:
 - Deberta-v2
 - Deit
 - Detr
+- DINOv2
 - DistilBert
 - Donut-Swin
 - Electra
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index a58f42dca4..2f834f64fd 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -708,6 +708,10 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig):
     pass
 
 
+class Dinov2OnnxConfig(ViTOnnxConfig):
+    pass
+
+
 class MobileViTOnnxConfig(ViTOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-4
 
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 4d3f9f98d0..91025b6d2d 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -482,6 +482,11 @@ class TasksManager:
             "image-segmentation",
             onnx="DetrOnnxConfig",
         ),
+        "dinov2": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            onnx="Dinov2OnnxConfig",
+        ),
         "distilbert": supported_tasks_mapping(
             "feature-extraction",
             "fill-mask",
diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py
index eb9b540480..8d54c9a0f9 100644
--- a/optimum/onnxruntime/modeling_ort.py
+++ b/optimum/onnxruntime/modeling_ort.py
@@ -1534,7 +1534,7 @@ def forward(
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTModelForImageClassification(ORTModel):
     """
-    ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit.
+    ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, dinov2, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit.
     """
 
     auto_model_class = AutoModelForImageClassification
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
index 6fa4adcecf..38d63cf782 100644
--- a/optimum/utils/normalized_config.py
+++ b/optimum/utils/normalized_config.py
@@ -190,6 +190,7 @@ class NormalizedConfigManager:
         'data2vec-text',
         'data2vec-vision',
         'detr',
+        'dinov2',
         'flaubert',
         'groupvit',
         'ibert',
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 9af7806e7f..7ca67d8f05 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -62,6 +62,7 @@
     "deberta": "hf-internal-testing/tiny-random-DebertaModel",
     "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model",
     "deit": "hf-internal-testing/tiny-random-DeiTModel",
+    "dinov2": "hf-internal-testing/tiny-random-Dinov2Model",
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel",
     "detr": "hf-internal-testing/tiny-random-DetrModel",  # hf-internal-testing/tiny-random-detr is larger
diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py
index bb06e42157..81e1cb3149 100644
--- a/tests/onnxruntime/test_modeling.py
+++ b/tests/onnxruntime/test_modeling.py
@@ -2702,6 +2702,7 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin):
         "convnextv2",
         "data2vec_vision",
         "deit",
+        "dinov2",
         "levit",
         "mobilenet_v1",
         "mobilenet_v2",
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 8e579879ea..70be094fec 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -50,6 +50,7 @@
     "deit": "hf-internal-testing/tiny-random-DeiTModel",
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "detr": "hf-internal-testing/tiny-random-detr",
+    "dinov2": "hf-internal-testing/tiny-random-Dinov2Model",
     "distilbert": "hf-internal-testing/tiny-random-DistilBertModel",
     "electra": "hf-internal-testing/tiny-random-ElectraModel",
     "encoder-decoder": {

From c4d6bc27f1b4f18252fae2151e8f7a7c48668027 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Sat, 9 Dec 2023 19:42:40 +0000
Subject: [PATCH 05/23] Use height and width from preprocessor

---
 optimum/exporters/onnx/model_configs.py | 37 ++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 2f834f64fd..776a1f8506 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -708,8 +708,43 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig):
     pass
 
 
+class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(
+            task=task,
+            normalized_config=normalized_config,
+            batch_size=batch_size,
+            num_channels=num_channels,
+            width=width,
+            height=height,
+            **kwargs,
+        )
+
+        from transformers.onnx.utils import get_preprocessor
+
+        preprocessor = get_preprocessor(normalized_config._name_or_path)
+        if preprocessor is not None and hasattr(preprocessor, "crop_size"):
+            self.height = preprocessor.crop_size.get("height", self.height)
+            self.width = preprocessor.crop_size.get("width", self.width)
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        input_ = super().generate(
+            input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
+        )
+        return input_
+
+
 class Dinov2OnnxConfig(ViTOnnxConfig):
-    pass
+    DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator, )
 
 
 class MobileViTOnnxConfig(ViTOnnxConfig):

From cb8d362047d8e45c36561254ebff83bd633f2fb6 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Sat, 9 Dec 2023 19:48:11 +0000
Subject: [PATCH 06/23] Fix tuple formatting in Dinov2OnnxConfig

---
 optimum/exporters/onnx/model_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 776a1f8506..a8fa6c1e01 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -744,7 +744,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 
 class Dinov2OnnxConfig(ViTOnnxConfig):
-    DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator, )
+    DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,)
 
 
 class MobileViTOnnxConfig(ViTOnnxConfig):

From 94c332905cf9f0c3318cddce8e0ea1d3919be89e Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Sat, 23 Dec 2023 23:12:18 +0000
Subject: [PATCH 07/23] Remove attention mask from Siglip model inputs

---
 optimum/exporters/onnx/model_configs.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 66596a1bf7..761fceb6f2 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -881,7 +881,16 @@ class SiglipNormalizedConfig(CLIPNormalizedConfig):
 
 
 class SiglipOnnxConfig(CLIPOnnxConfig):
-    pass
+    NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig
+    DEFAULT_ONNX_OPSET = 13
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {
+            "input_ids": {0: "text_batch_size", 1: "sequence_length"},
+            "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"},
+            # NOTE: No attention_mask
+        }
 
 
 class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig):

From 8d4b09e1aeb42c02edf01450f3b7be9db171bc9d Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 29 Aug 2024 00:52:03 +0000
Subject: [PATCH 08/23] Add ONNX export support for Hiera models

---
 docs/source/exporters/onnx/overview.mdx      | 1 +
 optimum/exporters/onnx/model_configs.py      | 4 ++++
 optimum/exporters/tasks.py                   | 5 +++++
 optimum/utils/normalized_config.py           | 1 +
 tests/exporters/exporters_utils.py           | 2 ++
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 6 files changed, 14 insertions(+)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 747e1396fb..11d0bc4a92 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -52,6 +52,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - GPT-NeoX
 - OPT
 - GroupVit
+- Hiera
 - Hubert
 - IBert
 - LayoutLM
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 3e11c7e614..81de33116f 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -772,6 +772,10 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11
 
 
+class HieraOnnxConfig(ViTOnnxConfig):
+    DEFAULT_ONNX_OPSET = 11
+
+
 class MobileViTOnnxConfig(ViTOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-4
     DEFAULT_ONNX_OPSET = 11
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index f02f176923..01270f0b40 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -705,6 +705,11 @@ class TasksManager:
             "feature-extraction",
             onnx="GroupViTOnnxConfig",
         ),
+        "hiera": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            onnx="HieraOnnxConfig",
+        ),
         "hubert": supported_tasks_mapping(
             "feature-extraction",
             "automatic-speech-recognition",
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
index 81207b7649..64dc0d7cc9 100644
--- a/optimum/utils/normalized_config.py
+++ b/optimum/utils/normalized_config.py
@@ -206,6 +206,7 @@ class NormalizedConfigManager:
         'detr',
         'flaubert',
         'groupvit',
+        'hiera',
         'ibert',
         'layoutlm',
         'layoutlmv3',
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index a55c7a124d..2af51fc183 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -101,6 +101,7 @@
     "gpt-neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
     "gptj": "hf-internal-testing/tiny-random-GPTJModel",
     "groupvit": "hf-internal-testing/tiny-random-groupvit",
+    "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification",
     "ibert": "hf-internal-testing/tiny-random-IBertModel",
     "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
@@ -231,6 +232,7 @@
     "gpt-neox": "EleutherAI/gpt-neox-20b",
     "gptj": "anton-l/gpt-j-tiny-random",  # TODO
     "groupvit": "nvidia/groupvit-gcc-yfcc",
+    "hiera": "facebook/hiera-tiny-224-in1k-hf",
     "ibert": "kssteven/ibert-roberta-base",
     "imagegpt": "openai/imagegpt-small",
     "levit": "facebook/levit-128S",
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index bb6935461d..3dc6be1909 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -105,6 +105,7 @@
     "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
     "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM",
     "groupvit": "hf-internal-testing/tiny-random-groupvit",
+    "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification",
     "hubert": "hf-internal-testing/tiny-random-HubertModel",
     "ibert": "hf-internal-testing/tiny-random-IBertModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",

From b96bb6184523d63d62bd5a776264edf452841753 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 29 Aug 2024 13:56:07 +0000
Subject: [PATCH 09/23] Add ONNX export support for SwinV2

---
 docs/source/exporters/onnx/overview.mdx      | 1 +
 optimum/exporters/onnx/model_configs.py      | 4 ++++
 optimum/exporters/tasks.py                   | 6 ++++++
 optimum/onnxruntime/modeling_ort.py          | 2 +-
 optimum/onnxruntime/utils.py                 | 1 +
 tests/exporters/exporters_utils.py           | 2 ++
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 7 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 11d0bc4a92..908d08b6f3 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -95,6 +95,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - Splinter
 - SqueezeBert
 - Swin
+- SwinV2
 - T5
 - Table Transformer
 - TROCR
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 81de33116f..7670f95b8e 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -817,6 +817,10 @@ class SwinOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11
 
 
+class SwinV2OnnxConfig(SwinOnnxConfig):
+    pass
+
+
 class Swin2srOnnxConfig(SwinOnnxConfig):
     pass
 
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 01270f0b40..b771eb731f 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1054,6 +1054,12 @@ class TasksManager:
             "masked-im",
             onnx="SwinOnnxConfig",
         ),
+        "swinv2": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            "masked-im",
+            onnx="SwinV2OnnxConfig",
+        ),
         "swin2sr": supported_tasks_mapping(
             "feature-extraction",
             "image-to-image",
diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py
index 254b771e33..7e53005ed2 100644
--- a/optimum/onnxruntime/modeling_ort.py
+++ b/optimum/onnxruntime/modeling_ort.py
@@ -1682,7 +1682,7 @@ def forward(
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTModelForImageClassification(ORTModel):
     """
-    ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit.
+    ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, swinv2, vit.
     """
 
     auto_model_class = AutoModelForImageClassification
diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py
index ad40af92b9..e4c16ae83a 100644
--- a/optimum/onnxruntime/utils.py
+++ b/optimum/onnxruntime/utils.py
@@ -175,6 +175,7 @@ def check_optimization_supported_model(cls, model_type: str, optimization_config
             "clip",
             "vit",
             "swin",
+            "swinv2",
         ]
         model_type = model_type.replace("_", "-")
         if (model_type not in cls._conf) or (cls._conf[model_type] not in supported_model_types_for_optimization):
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 2af51fc183..eec4bb8dd2 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -148,6 +148,7 @@
     "splinter": "hf-internal-testing/tiny-random-SplinterModel",
     "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel",
     "swin": "hf-internal-testing/tiny-random-SwinModel",
+    "swinv2": "hf-internal-testing/tiny-random-Swinv2Model",
     "swin2sr": "hf-internal-testing/tiny-random-Swin2SRModel",
     "t5": "hf-internal-testing/tiny-random-t5",
     "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel",
@@ -268,6 +269,7 @@
     "splinter": "hf-internal-testing/tiny-random-SplinterModel",
     "squeezebert": "squeezebert/squeezebert-uncased",
     "swin": "microsoft/swin-tiny-patch4-window7-224",
+    "swinv2": "microsoft/swinv2-tiny-patch4-window16-256",
     "t5": "t5-small",
     "table-transformer": "microsoft/table-transformer-detection",
     "vit": "google/vit-base-patch16-224",
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 3dc6be1909..3b52194a12 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -144,6 +144,7 @@
     "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch",
     "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl",
     "swin": "hf-internal-testing/tiny-random-SwinModel",
+    "swinv2": "hf-internal-testing/tiny-random-Swinv2Model",
     "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224",
     "t5": "hf-internal-testing/tiny-random-t5",
     "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel",

From fe140c6f106e1d490d1e9cc3a275130d62d5cae9 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 29 Aug 2024 16:23:49 +0000
Subject: [PATCH 10/23] Upgrade Siglip to opset=14

---
 optimum/exporters/onnx/model_configs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index b5db3feeff..4d2ced6fc1 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -1087,7 +1087,9 @@ class SiglipNormalizedConfig(CLIPNormalizedConfig):
 
 class SiglipOnnxConfig(CLIPOnnxConfig):
     NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig
-    DEFAULT_ONNX_OPSET = 13
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 13 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
 
     @property
     def inputs(self) -> Dict[str, Dict[int, str]]:

From 09ae91af4ac324cf1ccb6ea25413e287bda96b70 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Fri, 30 Aug 2024 12:18:16 +0000
Subject: [PATCH 11/23] Add VQA task

---
 optimum/exporters/tasks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index b02e6b392a..27cf9b6ef2 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -223,6 +223,7 @@ class TasksManager:
             "text2text-generation": "AutoModelForSeq2SeqLM",
             "text-classification": "AutoModelForSequenceClassification",
             "token-classification": "AutoModelForTokenClassification",
+            "visual-question-answering": "AutoModelForVisualQuestionAnswering",
             "zero-shot-image-classification": "AutoModelForZeroShotImageClassification",
             "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection",
         }

From 96afc91dcb7a4b157270f80bd2bd0e2bf023b14b Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Fri, 30 Aug 2024 14:35:25 +0000
Subject: [PATCH 12/23] Add ONNX export support for MaskFormer

---
 docs/source/exporters/onnx/overview.mdx |  1 +
 optimum/exporters/onnx/model_configs.py | 16 ++++++++++++++++
 optimum/exporters/tasks.py              |  7 ++++++-
 tests/exporters/exporters_utils.py      |  2 ++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 869d481840..164118ba9c 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -65,6 +65,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - M2-M100
 - Marian
 - MarkupLM
+- Maskformer
 - MBart
 - Mistral
 - MobileBert
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 4d2ced6fc1..1c11d1e554 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -895,6 +895,22 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig):
     pass
 
 
+class MaskformerOnnxConfig(ViTOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 12, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 12
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self.task == "image-segmentation":
+            return {
+                "class_queries_logits": {0: "batch_size", 1: "num_queries"},
+                "masks_queries_logits": {0: "batch_size", 1: "num_queries", 2: "height", 3: "width"},
+            }
+        else:
+            return super().outputs
+
+
 class DonutSwinOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11
 
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 27cf9b6ef2..192c2dbfb5 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -209,7 +209,7 @@ class TasksManager:
             "feature-extraction": "AutoModel",
             "fill-mask": "AutoModelForMaskedLM",
             "image-classification": "AutoModelForImageClassification",
-            "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"),
+            "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation", "AutoModelForInstanceSegmentation", "AutoModelForUniversalSegmentation"),
             "image-to-image": "AutoModelForImageToImage",
             "image-to-text": "AutoModelForVision2Seq",
             "mask-generation": "AutoModel",
@@ -797,6 +797,11 @@ class TasksManager:
             "question-answering",
             onnx="MarkupLMOnnxConfig",
         ),
+        "maskformer": supported_tasks_mapping(
+            "feature-extraction",
+            "image-segmentation",
+            onnx="MaskformerOnnxConfig",
+        ),
         "mbart": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 5b7719a921..e96e756725 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -115,6 +115,7 @@
     "m2m-100": "hf-internal-testing/tiny-random-m2m_100",
     "marian": "sshleifer/tiny-marian-en-de",  # hf-internal-testing ones are broken
     "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel",
+    "maskformer": "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
     "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",
@@ -248,6 +249,7 @@
     "m2m-100": "hf-internal-testing/tiny-random-m2m_100",  # Not using facebook/m2m100_418M because it takes too much time for testing.
     "marian": "Helsinki-NLP/opus-mt-en-de",
     "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel",
+    "maskformer": "facebook/maskformer-swin-tiny-coco",
     "mbart": "sshleifer/tiny-mbart",
     "mobilebert": "google/mobilebert-uncased",
     # "mobilenet_v1": "google/mobilenet_v1_0.75_192",

From 844aa66e0694a97a06fb9a25bfa582ea3f2de481 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Fri, 30 Aug 2024 16:49:19 +0000
Subject: [PATCH 13/23] Add ONNX export support for PVT

---
 docs/source/exporters/onnx/overview.mdx      | 1 +
 optimum/exporters/onnx/model_configs.py      | 4 ++++
 optimum/exporters/tasks.py                   | 5 +++++
 tests/exporters/exporters_utils.py           | 2 ++
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 5 files changed, 13 insertions(+)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index 164118ba9c..cf83fbeaba 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -83,6 +83,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - Phi3
 - Pix2Struct
 - PoolFormer
+- PVT
 - Qwen2(Qwen1.5)
 - RegNet
 - ResNet
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 1c11d1e554..47bfed1267 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -776,6 +776,10 @@ class HieraOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11
 
 
+class PvtOnnxConfig(ViTOnnxConfig):
+    DEFAULT_ONNX_OPSET = 11
+
+
 class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
     def __init__(
         self,
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 192c2dbfb5..2231d66de0 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -974,6 +974,11 @@ class TasksManager:
             "image-classification",
             onnx="PoolFormerOnnxConfig",
         ),
+        "pvt": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            onnx="PvtOnnxConfig",
+        ),
         "regnet": supported_tasks_mapping(
             "feature-extraction",
             "image-classification",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index e96e756725..5d388715e0 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -140,6 +140,7 @@
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     # "rembert": "google/rembert",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
+    "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification",
     "qwen2": "fxmarty/tiny-dummy-qwen2",
     "regnet": "hf-internal-testing/tiny-random-RegNetModel",
     "resnet": "hf-internal-testing/tiny-random-resnet",
@@ -264,6 +265,7 @@
     "perceiver": "hf-internal-testing/tiny-random-PerceiverModel",  # Not using deepmind/language-perceiver because it takes too much time for testing.
     # "rembert": "google/rembert",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
+    "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification",
     "regnet": "facebook/regnet-y-040",
     "resnet": "microsoft/resnet-50",
     "roberta": "roberta-base",
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 0e8d42fbcc..947db0d8cd 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -133,6 +133,7 @@
     "phi3": "Xenova/tiny-random-Phi3ForCausalLM",
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
+    "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification",
     "qwen2": "fxmarty/tiny-dummy-qwen2",
     "resnet": "hf-internal-testing/tiny-random-resnet",
     "roberta": "hf-internal-testing/tiny-random-RobertaModel",

From de07c7aaf512ca45d6635f089dd8763dee77b870 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Fri, 30 Aug 2024 17:26:19 +0000
Subject: [PATCH 14/23] Add ONNX export support for ViTMAE and ViTMSN

---
 docs/source/exporters/onnx/overview.mdx |  2 ++
 optimum/exporters/onnx/model_configs.py | 12 ++++++++++++
 optimum/exporters/tasks.py              | 16 +++++++++++++++-
 tests/exporters/exporters_utils.py      |  2 ++
 4 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index cf83fbeaba..195340bbd0 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -107,6 +107,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - UniSpeech SAT
 - Vision Encoder Decoder
 - Vit
+- ViTMAE
+- ViTMSN
 - Wav2Vec2
 - Wav2Vec2 Conformer
 - WavLM
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 47bfed1267..aced5b7d7f 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -780,6 +780,18 @@ class PvtOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11
 
 
+class VitMAEOnnxConfig(ViTOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
+
+
+class VitMSNOnnxConfig(ViTOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
+
+
 class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
     def __init__(
         self,
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 2231d66de0..ab6b9bd1d7 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1135,7 +1135,21 @@ class TasksManager:
             onnx="VisionEncoderDecoderOnnxConfig",
         ),
         "vit": supported_tasks_mapping(
-            "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig"
+            "feature-extraction",
+            "image-classification",
+            "masked-im",
+            onnx="ViTOnnxConfig",
+        ),
+        "vit-mae": supported_tasks_mapping(
+            "feature-extraction",
+            "masked-im",
+            onnx="VitMAEOnnxConfig",
+        ),
+        "vit-msn": supported_tasks_mapping(
+            "feature-extraction",
+            "image-classification",
+            "masked-im",
+            onnx="VitMSNOnnxConfig",
         ),
         "vits": supported_tasks_mapping(
             "text-to-audio",
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 5d388715e0..6983a4e029 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -157,6 +157,7 @@
     "t5": "hf-internal-testing/tiny-random-t5",
     "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel",
     "vit": "hf-internal-testing/tiny-random-vit",
+    "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification",
     "vits": "echarlaix/tiny-random-vits",
     "yolos": "hf-internal-testing/tiny-random-YolosModel",
     "whisper": "openai/whisper-tiny.en",  # hf-internal-testing ones are broken
@@ -279,6 +280,7 @@
     "t5": "t5-small",
     "table-transformer": "microsoft/table-transformer-detection",
     "vit": "google/vit-base-patch16-224",
+    "vit-msn": "facebook/vit-msn-small",
     "yolos": "hustvl/yolos-tiny",
     "whisper": "openai/whisper-tiny.en",
     "hubert": "facebook/hubert-base-ls960",

From 398d07a918162556129278476b344bb07c4e876e Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 14 Nov 2024 19:29:09 +0000
Subject: [PATCH 15/23] Add siglip unit tests

---
 optimum/exporters/onnx/model_configs.py      | 6 ++++++
 optimum/exporters/tasks.py                   | 6 ++++++
 tests/exporters/exporters_utils.py           | 4 +++-
 tests/onnxruntime/utils_onnxruntime_tests.py | 1 +
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index aced5b7d7f..1208354db4 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -1140,6 +1140,12 @@ class SiglipTextOnnxConfig(CLIPTextOnnxConfig):
     pass
 
 
+class SiglipVisionModelOnnxConfig(CLIPVisionModelOnnxConfig):
+    # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported.
+    # Support for this operator was added in version 14, try exporting with this version.
+    DEFAULT_ONNX_OPSET = 14
+
+
 class UNetOnnxConfig(VisionOnnxConfig):
     ATOL_FOR_VALIDATION = 1e-3
     # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index ab6b9bd1d7..af8f272aec 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1051,6 +1051,10 @@ class TasksManager:
             "feature-extraction",
             onnx="SiglipTextWithProjectionOnnxConfig",
         ),
+        "siglip-vision-model": supported_tasks_mapping(
+            "feature-extraction",
+            onnx="SiglipVisionModelOnnxConfig",
+        ),
         "speech-to-text": supported_tasks_mapping(
             "feature-extraction",
             "feature-extraction-with-past",
@@ -1229,6 +1233,8 @@ class TasksManager:
         "vae-decoder",
         "clip-text-model",
         "clip-text-with-projection",
+        "siglip-text-model",
+        "siglip-text-with-projection",
         "trocr",  # supported through the vision-encoder-decoder model type
     }
     _SUPPORTED_CLI_MODEL_TYPE = (
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 6983a4e029..30e3110c6b 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -148,7 +148,8 @@
     "roformer": "hf-internal-testing/tiny-random-RoFormerModel",
     "sam": "fxmarty/sam-vit-tiny-random",
     "segformer": "hf-internal-testing/tiny-random-SegformerModel",
-    "siglip": "HuggingFaceM4/tiny-random-siglip",
+    "siglip": "hf-internal-testing/tiny-random-SiglipModel",
+    "siglip-vision-model": "hf-internal-testing/tiny-random-SiglipVisionModel",
     "splinter": "hf-internal-testing/tiny-random-SplinterModel",
     "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel",
     "swin": "hf-internal-testing/tiny-random-SwinModel",
@@ -273,6 +274,7 @@
     "roformer": "junnyu/roformer_chinese_base",
     "sam": "facebook/sam-vit-base",
     "segformer": "nvidia/segformer-b0-finetuned-ade-512-512",
+    "siglip": "google/siglip-base-patch16-224",
     "splinter": "hf-internal-testing/tiny-random-SplinterModel",
     "squeezebert": "squeezebert/squeezebert-uncased",
     "swin": "microsoft/swin-tiny-patch4-window7-224",
diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py
index 947db0d8cd..7b5b3ef62a 100644
--- a/tests/onnxruntime/utils_onnxruntime_tests.py
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py
@@ -141,6 +141,7 @@
     "segformer": "hf-internal-testing/tiny-random-SegformerModel",
     "sew": "hf-internal-testing/tiny-random-SEWModel",
     "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h",
+    "siglip": "hf-internal-testing/tiny-random-SiglipModel",
     "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel",
     "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel",
     "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch",

From 86706d19fe4e4c8871eea743c2299401ad7a5319 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 14 Nov 2024 19:34:19 +0000
Subject: [PATCH 16/23] Add vit-mae unit tests

---
 optimum/exporters/tasks.py         | 1 +
 tests/exporters/exporters_utils.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index af8f272aec..de287f09e3 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -312,6 +312,7 @@ class TasksManager:
         "lcm": "text-to-image",
         "stable-diffusion": "text-to-image",
         "stable-diffusion-xl": "text-to-image",
+        "pretraining": "feature-extraction",
     }
 
     _CUSTOM_CLASSES = {
diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py
index 30e3110c6b..13e1f9e14b 100644
--- a/tests/exporters/exporters_utils.py
+++ b/tests/exporters/exporters_utils.py
@@ -158,6 +158,7 @@
     "t5": "hf-internal-testing/tiny-random-t5",
     "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel",
     "vit": "hf-internal-testing/tiny-random-vit",
+    "vit-mae": "hf-internal-testing/tiny-random-ViTMAEModel",
     "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification",
     "vits": "echarlaix/tiny-random-vits",
     "yolos": "hf-internal-testing/tiny-random-YolosModel",
@@ -282,6 +283,7 @@
     "t5": "t5-small",
     "table-transformer": "microsoft/table-transformer-detection",
     "vit": "google/vit-base-patch16-224",
+    "vit-mae": "facebook/vit-mae-base",
     "vit-msn": "facebook/vit-msn-small",
     "yolos": "hustvl/yolos-tiny",
     "whisper": "openai/whisper-tiny.en",

From 8ad2e3adfe42883cfb2e3fd15750e42763fa7481 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 14 Nov 2024 19:58:47 +0000
Subject: [PATCH 17/23] Code formatting

---
 optimum/exporters/tasks.py           | 7 ++++++-
 optimum/onnxruntime/runs/__init__.py | 6 +++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index e4b4b1e445..5382991ff0 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -209,7 +209,12 @@ class TasksManager:
             "feature-extraction": "AutoModel",
             "fill-mask": "AutoModelForMaskedLM",
             "image-classification": "AutoModelForImageClassification",
-            "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation", "AutoModelForInstanceSegmentation", "AutoModelForUniversalSegmentation"),
+            "image-segmentation": (
+                "AutoModelForImageSegmentation",
+                "AutoModelForSemanticSegmentation",
+                "AutoModelForInstanceSegmentation",
+                "AutoModelForUniversalSegmentation",
+            ),
             "image-to-image": "AutoModelForImageToImage",
             "image-to-text": "AutoModelForVision2Seq",
             "mask-generation": "AutoModel",
diff --git a/optimum/onnxruntime/runs/__init__.py b/optimum/onnxruntime/runs/__init__.py
index 1d98294934..d21db2a4ac 100644
--- a/optimum/onnxruntime/runs/__init__.py
+++ b/optimum/onnxruntime/runs/__init__.py
@@ -110,9 +110,9 @@ def __init__(self, run_config):
         model_class = FeaturesManager.get_model_class_for_feature(get_autoclass_name(self.task))
         self.torch_model = model_class.from_pretrained(run_config["model_name_or_path"])
 
-        self.return_body[
-            "model_type"
-        ] = self.torch_model.config.model_type  # return_body is initialized in parent class
+        self.return_body["model_type"] = (
+            self.torch_model.config.model_type
+        )  # return_body is initialized in parent class
 
     def _launch_time(self, trial):
         batch_size = trial.suggest_categorical("batch_size", self.batch_sizes)

From 55a19cb724f515b860b865b5c3390a7e9dc5ba9f Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 14 Nov 2024 19:59:10 +0000
Subject: [PATCH 18/23] Add maskformer to list of supported models

---
 optimum/onnxruntime/modeling_ort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py
index f1f7b2fe37..a55eb064fa 100644
--- a/optimum/onnxruntime/modeling_ort.py
+++ b/optimum/onnxruntime/modeling_ort.py
@@ -1784,7 +1784,7 @@ def forward(
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTModelForSemanticSegmentation(ORTModel):
     """
-    ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports segformer.
+    ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports maskformer, segformer.
     """
 
     auto_model_class = AutoModelForSemanticSegmentation

From fd15bd3e0dbcd6bb6950c46cdd69c34a53975e50 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Thu, 14 Nov 2024 20:05:03 +0000
Subject: [PATCH 19/23] Formatting

---
 optimum/onnxruntime/runs/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/optimum/onnxruntime/runs/__init__.py b/optimum/onnxruntime/runs/__init__.py
index d21db2a4ac..1d98294934 100644
--- a/optimum/onnxruntime/runs/__init__.py
+++ b/optimum/onnxruntime/runs/__init__.py
@@ -110,9 +110,9 @@ def __init__(self, run_config):
         model_class = FeaturesManager.get_model_class_for_feature(get_autoclass_name(self.task))
         self.torch_model = model_class.from_pretrained(run_config["model_name_or_path"])
 
-        self.return_body["model_type"] = (
-            self.torch_model.config.model_type
-        )  # return_body is initialized in parent class
+        self.return_body[
+            "model_type"
+        ] = self.torch_model.config.model_type  # return_body is initialized in parent class
 
     def _launch_time(self, trial):
         batch_size = trial.suggest_categorical("batch_size", self.batch_sizes)

From 7f0cb92ced010bae2ad448916bf620c3b965da5a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 18 Dec 2024 17:20:43 +0100
Subject: [PATCH 20/23] fix typo

---
 docs/source/exporters/onnx/overview.mdx | 2 +-
 optimum/exporters/onnx/model_configs.py | 2 +-
 optimum/exporters/tasks.py              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx
index a434094046..b5129c23f2 100644
--- a/docs/source/exporters/onnx/overview.mdx
+++ b/docs/source/exporters/onnx/overview.mdx
@@ -66,7 +66,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra
 - M2-M100
 - Marian
 - MarkupLM
-- Maskformer
+- MaskFormer
 - MBart
 - MGP-STR
 - Mistral
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 5c3269e193..771c9de70f 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -986,7 +986,7 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig):
     pass
 
 
-class MaskformerOnnxConfig(ViTOnnxConfig):
+class MaskFormerOnnxConfig(ViTOnnxConfig):
     # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported.
     # Support for this operator was added in version 12, try exporting with this version.
     DEFAULT_ONNX_OPSET = 12
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index ab48154dda..bd8fd1de15 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -833,7 +833,7 @@ class TasksManager:
         "maskformer": supported_tasks_mapping(
             "feature-extraction",
             "image-segmentation",
-            onnx="MaskformerOnnxConfig",
+            onnx="MaskFormerOnnxConfig",
         ),
         "mbart": supported_tasks_mapping(
             "feature-extraction",

From 7a2e94a12036a8e8d9172fbf4638f6c976d13600 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 18 Dec 2024 17:21:48 +0100
Subject: [PATCH 21/23] remove vit-mae masked-im task

---
 optimum/exporters/tasks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index bd8fd1de15..b09ec20a8c 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1205,7 +1205,6 @@ class TasksManager:
         ),
         "vit-mae": supported_tasks_mapping(
             "feature-extraction",
-            "masked-im",
             onnx="VitMAEOnnxConfig",
         ),
         "vit-msn": supported_tasks_mapping(

From 01929b2e8773bfe11791e12f6ebbf7ce5db3c8a9 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 18 Dec 2024 17:23:13 +0100
Subject: [PATCH 22/23] remove vit-msn masked-im task

---
 optimum/exporters/tasks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index b09ec20a8c..7cb5a31d2d 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1210,7 +1210,6 @@ class TasksManager:
         "vit-msn": supported_tasks_mapping(
             "feature-extraction",
             "image-classification",
-            "masked-im",
             onnx="VitMSNOnnxConfig",
         ),
         "vits": supported_tasks_mapping(

From 3fa346c21f09c9333663130c784a8f813fa3ac8c Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Wed, 18 Dec 2024 17:37:28 +0100
Subject: [PATCH 23/23] fix output names for maskformer export

---
 optimum/exporters/onnx/model_configs.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index 771c9de70f..4c5a727a18 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -1001,6 +1001,12 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         else:
             return super().outputs
 
+    @property
+    def torch_to_onnx_output_map(self) -> Dict[str, str]:
+        return {
+            "transformer_decoder_last_hidden_state": "last_hidden_state",
+        }
+
 
 class DonutSwinOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 11