Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
360 changes: 180 additions & 180 deletions src/transformers/models/auto/auto_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,210 +628,210 @@

SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
[
("nllb-moe", "nllb_moe"),
("dab-detr", "dab_detr"),
("xlm-roberta-xl", "xlm_roberta_xl"),
("aimv2_text_model", "aimv2"),
("aimv2_vision_model", "aimv2"),
Comment on lines -631 to +632
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just sorted the entries; this was never done in the past. Sorting makes the order deterministic across all machines.

("align_text_model", "align"),
("align_vision_model", "align"),
("altclip_text_model", "altclip"),
("altclip_vision_model", "altclip"),
("aria_text", "aria"),
("audio-spectrogram-transformer", "audio_spectrogram_transformer"),
("megatron-bert", "megatron_bert"),
("clipseg_text_model", "clipseg"),
("clipseg_vision_model", "clipseg"),
("t5gemma2_text", "t5gemma2"),
("t5gemma2_encoder", "t5gemma2"),
("t5gemma2_decoder", "t5gemma2"),
("llama4_vision_model", "llama4"),
("llama4_text", "llama4"),
("pe_video_encoder", "pe_video"),
("audioflamingo3_encoder", "audioflamingo3"),
("bert-generation", "bert_generation"),
("blenderbot-small", "blenderbot_small"),
("blip-2", "blip_2"),
("blip_2_qformer", "blip_2"),
("blip_2_vision_model", "blip_2"),
("blip_text_model", "blip"),
("blip_vision_model", "blip"),
("blt_global_transformer", "blt"),
("blt_local_decoder", "blt"),
("blt_local_encoder", "blt"),
("blt_patcher", "blt"),
("bridgetower_text_model", "bridgetower"),
("bridgetower_vision_model", "bridgetower"),
("chameleon_vqgan", "chameleon"),
("aimv2_vision_model", "aimv2"),
("aimv2_text_model", "aimv2"),
("siglip2_text_model", "siglip2"),
("siglip2_vision_model", "siglip2"),
("chinese_clip_text_model", "chinese_clip"),
("chinese_clip_vision_model", "chinese_clip"),
("clap_audio_model", "clap"),
("clap_text_model", "clap"),
("clip_text_model", "clip"),
("clip_vision_model", "clip"),
("clipseg_text_model", "clipseg"),
("clipseg_vision_model", "clipseg"),
("clvp_decoder", "clvp"),
("clvp_encoder", "clvp"),
("csm_depth_decoder_model", "csm"),
("dab-detr", "dab_detr"),
("data2vec-audio", "data2vec"),
("data2vec-text", "data2vec"),
("data2vec-vision", "data2vec"),
("deberta-v2", "deberta_v2"),
("detr", "maskformer"),
("dia_decoder", "dia"),
("dia_encoder", "dia"),
("donut-swin", "donut"),
("edgetam_vision_model", "edgetam"),
("emu3_text_model", "emu3"),
("emu3_vqgan", "emu3"),
("encoder-decoder", "encoder_decoder"),
("ernie4_5_vl_moe_text", "ernie4_5_vl_moe"),
("ernie4_5_vl_moe_vision", "ernie4_5_vl_moe"),
("fastspeech2_conformer_hifigan", "fastspeech2_conformer"),
("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"),
("rt_detr_resnet", "rt_detr"),
("qwen2_5_vl_vision", "qwen2_5_vl"),
("qwen2_5_vl_text", "qwen2_5_vl"),
("glm4v_vision", "glm4v"),
("glm4v_text", "glm4v"),
("moshi_depth", "moshi"),
("qwen3_5_text", "qwen3_5"),
("qwen3_5_vision", "qwen3_5"),
("flava_image_model", "flava"),
("flava_multimodal_model", "flava"),
("flava_text_model", "flava"),
("florence_vision", "florence2"),
("gemma3_text", "gemma3"),
("glmasr_encoder", "glmasr"),
("sam3_vit_model", "sam3"),
("sam3_vision_model", "sam3"),
("sam3_lite_text_geometry_encoder", "sam3_lite_text"),
("sam3_lite_text_detr_encoder", "sam3_lite_text"),
("sam3_lite_text_detr_decoder", "sam3_lite_text"),
("sam3_lite_text_mask_decoder", "sam3_lite_text"),
("sam3_lite_text_text_model", "sam3_lite_text"),
("detr", "maskformer"),
("maskformer-swin", "maskformer"),
("roberta-prelayernorm", "roberta_prelayernorm"),
("glm_image_vqmodel", "glm_image"),
("glm_image_vision", "glm_image"),
("glm_image_text", "glm_image"),
("pix2struct_text_model", "pix2struct"),
("pix2struct_vision_model", "pix2struct"),
("clvp_encoder", "clvp"),
("clvp_decoder", "clvp"),
("idefics_vision", "idefics"),
("idefics_perciever", "idefics"),
("gemma3n_text", "gemma3n"),
("gemma3n_audio", "gemma3n"),
("gemma3n_text", "gemma3n"),
("gemma3n_vision", "gemma3n"),
("florence_vision", "florence2"),
("bert-generation", "bert_generation"),
("clap_text_model", "clap"),
("clap_audio_model", "clap"),
("vision-text-dual-encoder", "vision_text_dual_encoder"),
("dia_encoder", "dia"),
("dia_decoder", "dia"),
("phi4_multimodal_vision", "phi4_multimodal"),
("phi4_multimodal_audio", "phi4_multimodal"),
("metaclip_2_text_model", "metaclip_2"),
("metaclip_2_vision_model", "metaclip_2"),
("sam_hq_vision_model", "sam_hq"),
("voxtral_realtime_text", "voxtral_realtime"),
("voxtral_realtime_encoder", "voxtral_realtime"),
("gemma4_audio", "gemma4"),
("gemma4_text", "gemma4"),
("gemma4_vision", "gemma4"),
("git_vision_model", "git"),
("glm4v_moe_text", "glm4v_moe"),
("glm4v_moe_vision", "glm4v_moe"),
("glm4v_text", "glm4v"),
("glm4v_vision", "glm4v"),
("glm_image_text", "glm_image"),
("glm_image_vision", "glm_image"),
("glm_image_vqmodel", "glm_image"),
("glm_ocr_text", "glm_ocr"),
("glm_ocr_vision", "glm_ocr"),
("glmasr_encoder", "glmasr"),
("granite_speech_encoder", "granite_speech"),
("grounding-dino", "grounding_dino"),
("groupvit_text_model", "groupvit"),
("groupvit_vision_model", "groupvit"),
("idefics2_perceiver", "idefics2"),
("idefics2_vision", "idefics2"),
("idefics3_vision", "idefics3"),
("idefics_perciever", "idefics"),
("idefics_vision", "idefics"),
("instructblip_qformer", "instructblip"),
("instructblip_vision_model", "instructblip"),
("instructblipvideo_qformer", "instructblipvideo"),
("instructblipvideo_vision_model", "instructblipvideo"),
("internvl_vision", "internvl"),
("janus_vision_model", "janus"),
("janus_vqgan", "janus"),
("kosmos-2", "kosmos2"),
("kosmos-2.5", "kosmos2_5"),
("kosmos_2_5_text_model", "kosmos2_5"),
("kosmos_2_5_vision_model", "kosmos2_5"),
("kosmos_2_text_model", "kosmos2"),
("kosmos_2_vision_model", "kosmos2"),
("kosmos-2", "kosmos2"),
("voxtral_encoder", "voxtral"),
("grounding-dino", "grounding_dino"),
("t5_gemma_module", "t5gemma"),
("bridgetower_vision_model", "bridgetower"),
("bridgetower_text_model", "bridgetower"),
("qwen2_5_omni_vision_encoder", "qwen2_5_omni"),
("lasr_ctc", "lasr"),
("lasr_encoder", "lasr"),
("llama4_text", "llama4"),
("llama4_vision_model", "llama4"),
("lw_detr_vit", "lw_detr"),
("maskformer-swin", "maskformer"),
("megatron-bert", "megatron_bert"),
("metaclip_2_text_model", "metaclip_2"),
("metaclip_2_vision_model", "metaclip_2"),
("mgp-str", "mgp_str"),
("mlcd_vision_model", "mlcd"),
("mllama_text_model", "mllama"),
("mllama_vision_model", "mllama"),
("mm-grounding-dino", "mm_grounding_dino"),
("modernbert-decoder", "modernbert_decoder"),
("moonshine_streaming_encoder", "moonshine_streaming"),
("moshi_depth", "moshi"),
("musicgen_decoder", "musicgen"),
("musicgen_melody_decoder", "musicgen_melody"),
("nllb-moe", "nllb_moe"),
("omdet-turbo", "omdet_turbo"),
("openai-gpt", "openai"),
("owlv2_text_model", "owlv2"),
("owlv2_vision_model", "owlv2"),
("owlvit_text_model", "owlvit"),
("owlvit_vision_model", "owlvit"),
("paddleocr_vl_text", "paddleocr_vl"),
("paddleocr_vl_vision", "paddleocr_vl"),
("parakeet_ctc", "parakeet"),
("parakeet_encoder", "parakeet"),
("pe_audio_encoder", "pe_audio"),
("pe_audio_video_encoder", "pe_audio_video"),
("pe_video_encoder", "pe_video"),
("phi4_multimodal_audio", "phi4_multimodal"),
("phi4_multimodal_vision", "phi4_multimodal"),
("pix2struct_text_model", "pix2struct"),
("pix2struct_vision_model", "pix2struct"),
("qwen2_5_omni_audio_encoder", "qwen2_5_omni"),
("qwen2_5_omni_bigvgan", "qwen2_5_omni"),
("qwen2_5_omni_dit", "qwen2_5_omni"),
("qwen2_5_omni_talker", "qwen2_5_omni"),
("qwen2_5_omni_text", "qwen2_5_omni"),
("qwen2_5_omni_thinker", "qwen2_5_omni"),
("qwen2_5_omni_talker", "qwen2_5_omni"),
("qwen2_5_omni_dit", "qwen2_5_omni"),
("qwen2_5_omni_bigvgan", "qwen2_5_omni"),
("qwen2_5_omni_token2wav", "qwen2_5_omni"),
("speech-encoder-decoder", "speech_encoder_decoder"),
("pe_audio_video_encoder", "pe_audio_video"),
("kosmos_2_5_text_model", "kosmos2_5"),
("kosmos_2_5_vision_model", "kosmos2_5"),
("kosmos-2.5", "kosmos2_5"),
("altclip_text_model", "altclip"),
("altclip_vision_model", "altclip"),
("speecht5_hifigan", "speecht5"),
("pe_audio_encoder", "pe_audio"),
("smolvlm_vision", "smolvlm"),
("blt_local_encoder", "blt"),
("blt_local_decoder", "blt"),
("blt_global_transformer", "blt"),
("blt_patcher", "blt"),
("uvdoc_backbone", "uvdoc"),
("qwen2_5_omni_vision_encoder", "qwen2_5_omni"),
("qwen2_5_vl_text", "qwen2_5_vl"),
("qwen2_5_vl_vision", "qwen2_5_vl"),
("qwen2_audio_encoder", "qwen2_audio"),
("vision-encoder-decoder", "vision_encoder_decoder"),
("edgetam_vision_model", "edgetam"),
("qwen2_vl_vision", "qwen2_vl"),
("qwen2_vl_text", "qwen2_vl"),
("omdet-turbo", "omdet_turbo"),
("janus_vision_model", "janus"),
("janus_vqgan", "janus"),
("musicgen_decoder", "musicgen"),
("sam_vision_model", "sam"),
("data2vec-vision", "data2vec"),
("data2vec-audio", "data2vec"),
("data2vec-text", "data2vec"),
("idefics2_vision", "idefics2"),
("idefics2_perceiver", "idefics2"),
("aria_text", "aria"),
("qwen3_vl_moe_text", "qwen3_vl_moe"),
("qwen3_vl_moe_vision", "qwen3_vl_moe"),
("blenderbot-small", "blenderbot_small"),
("ernie4_5_vl_moe_vision", "ernie4_5_vl_moe"),
("ernie4_5_vl_moe_text", "ernie4_5_vl_moe"),
("siglip_text_model", "siglip"),
("siglip_vision_model", "siglip"),
("flava_image_model", "flava"),
("flava_text_model", "flava"),
("flava_multimodal_model", "flava"),
("modernbert-decoder", "modernbert_decoder"),
("lasr_encoder", "lasr"),
("lasr_ctc", "lasr"),
("instructblip_vision_model", "instructblip"),
("instructblip_qformer", "instructblip"),
("internvl_vision", "internvl"),
("qwen2_vl_vision", "qwen2_vl"),
("qwen3_5_moe_text", "qwen3_5_moe"),
("qwen3_5_moe_vision", "qwen3_5_moe"),
("qwen3_5_text", "qwen3_5"),
("qwen3_5_vision", "qwen3_5"),
("qwen3_omni_moe_audio_encoder", "qwen3_omni_moe"),
("qwen3_omni_moe_vision_encoder", "qwen3_omni_moe"),
("qwen3_omni_moe_text", "qwen3_omni_moe"),
("qwen3_omni_moe_thinker", "qwen3_omni_moe"),
("qwen3_omni_moe_talker_code_predictor", "qwen3_omni_moe"),
("qwen3_omni_moe_talker_text", "qwen3_omni_moe"),
("xclip_text_model", "x_clip"),
("xclip_vision_model", "x_clip"),
("xclip", "x_clip"),
("audioflamingo3_encoder", "audioflamingo3"),
("wav2vec2-bert", "wav2vec2_bert"),
("encoder-decoder", "encoder_decoder"),
("instructblipvideo_vision_model", "instructblipvideo"),
("instructblipvideo_qformer", "instructblipvideo"),
("emu3_vqgan", "emu3"),
("emu3_text_model", "emu3"),
("chinese_clip_text_model", "chinese_clip"),
("chinese_clip_vision_model", "chinese_clip"),
("owlv2_text_model", "owlv2"),
("owlv2_vision_model", "owlv2"),
("qwen3_5_moe_text", "qwen3_5_moe"),
("qwen3_5_moe_vision", "qwen3_5_moe"),
("lw_detr_vit", "lw_detr"),
("donut-swin", "donut"),
("moonshine_streaming_encoder", "moonshine_streaming"),
("owlvit_text_model", "owlvit"),
("owlvit_vision_model", "owlvit"),
("glm_ocr_vision", "glm_ocr"),
("glm_ocr_text", "glm_ocr"),
("xlm-roberta", "xlm_roberta"),
("unispeech-sat", "unispeech_sat"),
("wav2vec2-conformer", "wav2vec2_conformer"),
("blip_text_model", "blip"),
("blip_vision_model", "blip"),
("sam3_geometry_encoder", "sam3"),
("sam3_detr_encoder", "sam3"),
("sam3_detr_decoder", "sam3"),
("sam3_mask_decoder", "sam3"),
("granite_speech_encoder", "granite_speech"),
("deberta-v2", "deberta_v2"),
("qwen3_omni_moe_text", "qwen3_omni_moe"),
("qwen3_omni_moe_thinker", "qwen3_omni_moe"),
("qwen3_omni_moe_vision_encoder", "qwen3_omni_moe"),
("qwen3_vl_moe_text", "qwen3_vl_moe"),
("qwen3_vl_moe_vision", "qwen3_vl_moe"),
("qwen3_vl_text", "qwen3_vl"),
("qwen3_vl_vision", "qwen3_vl"),
("roberta-prelayernorm", "roberta_prelayernorm"),
("rt_detr_resnet", "rt_detr"),
("sam2_hiera_det_model", "sam2"),
("sam2_vision_model", "sam2"),
("openai-gpt", "openai"),
("csm_depth_decoder_model", "csm"),
("align_text_model", "align"),
("align_vision_model", "align"),
("groupvit_text_model", "groupvit"),
("groupvit_vision_model", "groupvit"),
("mgp-str", "mgp_str"),
("mm-grounding-dino", "mm_grounding_dino"),
("git_vision_model", "git"),
("musicgen_melody_decoder", "musicgen_melody"),
("mllama_vision_model", "mllama"),
("mllama_text_model", "mllama"),
("mlcd_vision_model", "mlcd"),
("sam3_detr_decoder", "sam3"),
("sam3_detr_encoder", "sam3"),
("sam3_geometry_encoder", "sam3"),
("sam3_lite_text_detr_decoder", "sam3_lite_text"),
("sam3_lite_text_detr_encoder", "sam3_lite_text"),
("sam3_lite_text_geometry_encoder", "sam3_lite_text"),
("sam3_lite_text_mask_decoder", "sam3_lite_text"),
("sam3_lite_text_text_model", "sam3_lite_text"),
("sam3_mask_decoder", "sam3"),
("sam3_vision_model", "sam3"),
("sam3_vit_model", "sam3"),
("sam_hq_vision_model", "sam_hq"),
("sam_vision_model", "sam"),
("sew-d", "sew_d"),
("video_llama_3_vision", "video_llama_3"),
("idefics3_vision", "idefics3"),
("gemma4_audio", "gemma4"),
("gemma4_text", "gemma4"),
("gemma4_vision", "gemma4"),
("qwen3_vl_vision", "qwen3_vl"),
("qwen3_vl_text", "qwen3_vl"),
("blip_2_vision_model", "blip_2"),
("blip_2_qformer", "blip_2"),
("blip-2", "blip_2"),
("parakeet_encoder", "parakeet"),
("parakeet_ctc", "parakeet"),
("siglip2_text_model", "siglip2"),
("siglip2_vision_model", "siglip2"),
("siglip_text_model", "siglip"),
("siglip_vision_model", "siglip"),
("smolvlm_vision", "smolvlm"),
("speech-encoder-decoder", "speech_encoder_decoder"),
("speecht5_hifigan", "speecht5"),
("t5_gemma_module", "t5gemma"),
("t5gemma2_decoder", "t5gemma2"),
("t5gemma2_encoder", "t5gemma2"),
("t5gemma2_text", "t5gemma2"),
("table-transformer", "table_transformer"),
("glm4v_moe_text", "glm4v_moe"),
("glm4v_moe_vision", "glm4v_moe"),
("paddleocr_vl_vision", "paddleocr_vl"),
("paddleocr_vl_text", "paddleocr_vl"),
("unispeech-sat", "unispeech_sat"),
("uvdoc_backbone", "uvdoc"),
("video_llama_3_vision", "video_llama_3"),
("vision-encoder-decoder", "vision_encoder_decoder"),
("vision-text-dual-encoder", "vision_text_dual_encoder"),
("voxtral_encoder", "voxtral"),
("voxtral_realtime_encoder", "voxtral_realtime"),
("voxtral_realtime_text", "voxtral_realtime"),
("wav2vec2-bert", "wav2vec2_bert"),
("wav2vec2-conformer", "wav2vec2_conformer"),
("xclip", "x_clip"),
("xclip_text_model", "x_clip"),
("xclip_vision_model", "x_clip"),
("xlm-roberta", "xlm_roberta"),
("xlm-roberta-xl", "xlm_roberta_xl"),
]
)

Expand Down
Loading
Loading