diff --git a/src/transformers/models/auto/auto_mappings.py b/src/transformers/models/auto/auto_mappings.py index 98c40e5a891b..0ec3cdf700ec 100644 --- a/src/transformers/models/auto/auto_mappings.py +++ b/src/transformers/models/auto/auto_mappings.py @@ -628,210 +628,210 @@ SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( [ - ("nllb-moe", "nllb_moe"), - ("dab-detr", "dab_detr"), - ("xlm-roberta-xl", "xlm_roberta_xl"), + ("aimv2_text_model", "aimv2"), + ("aimv2_vision_model", "aimv2"), + ("align_text_model", "align"), + ("align_vision_model", "align"), + ("altclip_text_model", "altclip"), + ("altclip_vision_model", "altclip"), + ("aria_text", "aria"), ("audio-spectrogram-transformer", "audio_spectrogram_transformer"), - ("megatron-bert", "megatron_bert"), - ("clipseg_text_model", "clipseg"), - ("clipseg_vision_model", "clipseg"), - ("t5gemma2_text", "t5gemma2"), - ("t5gemma2_encoder", "t5gemma2"), - ("t5gemma2_decoder", "t5gemma2"), - ("llama4_vision_model", "llama4"), - ("llama4_text", "llama4"), - ("pe_video_encoder", "pe_video"), + ("audioflamingo3_encoder", "audioflamingo3"), + ("bert-generation", "bert_generation"), + ("blenderbot-small", "blenderbot_small"), + ("blip-2", "blip_2"), + ("blip_2_qformer", "blip_2"), + ("blip_2_vision_model", "blip_2"), + ("blip_text_model", "blip"), + ("blip_vision_model", "blip"), + ("blt_global_transformer", "blt"), + ("blt_local_decoder", "blt"), + ("blt_local_encoder", "blt"), + ("blt_patcher", "blt"), + ("bridgetower_text_model", "bridgetower"), + ("bridgetower_vision_model", "bridgetower"), ("chameleon_vqgan", "chameleon"), - ("aimv2_vision_model", "aimv2"), - ("aimv2_text_model", "aimv2"), - ("siglip2_text_model", "siglip2"), - ("siglip2_vision_model", "siglip2"), + ("chinese_clip_text_model", "chinese_clip"), + ("chinese_clip_vision_model", "chinese_clip"), + ("clap_audio_model", "clap"), + ("clap_text_model", "clap"), ("clip_text_model", "clip"), ("clip_vision_model", "clip"), + ("clipseg_text_model", "clipseg"), + 
("clipseg_vision_model", "clipseg"), + ("clvp_decoder", "clvp"), + ("clvp_encoder", "clvp"), + ("csm_depth_decoder_model", "csm"), + ("dab-detr", "dab_detr"), + ("data2vec-audio", "data2vec"), + ("data2vec-text", "data2vec"), + ("data2vec-vision", "data2vec"), + ("deberta-v2", "deberta_v2"), + ("detr", "maskformer"), + ("dia_decoder", "dia"), + ("dia_encoder", "dia"), + ("donut-swin", "donut"), + ("edgetam_vision_model", "edgetam"), + ("emu3_text_model", "emu3"), + ("emu3_vqgan", "emu3"), + ("encoder-decoder", "encoder_decoder"), + ("ernie4_5_vl_moe_text", "ernie4_5_vl_moe"), + ("ernie4_5_vl_moe_vision", "ernie4_5_vl_moe"), ("fastspeech2_conformer_hifigan", "fastspeech2_conformer"), ("fastspeech2_conformer_with_hifigan", "fastspeech2_conformer"), - ("rt_detr_resnet", "rt_detr"), - ("qwen2_5_vl_vision", "qwen2_5_vl"), - ("qwen2_5_vl_text", "qwen2_5_vl"), - ("glm4v_vision", "glm4v"), - ("glm4v_text", "glm4v"), - ("moshi_depth", "moshi"), - ("qwen3_5_text", "qwen3_5"), - ("qwen3_5_vision", "qwen3_5"), + ("flava_image_model", "flava"), + ("flava_multimodal_model", "flava"), + ("flava_text_model", "flava"), + ("florence_vision", "florence2"), ("gemma3_text", "gemma3"), - ("glmasr_encoder", "glmasr"), - ("sam3_vit_model", "sam3"), - ("sam3_vision_model", "sam3"), - ("sam3_lite_text_geometry_encoder", "sam3_lite_text"), - ("sam3_lite_text_detr_encoder", "sam3_lite_text"), - ("sam3_lite_text_detr_decoder", "sam3_lite_text"), - ("sam3_lite_text_mask_decoder", "sam3_lite_text"), - ("sam3_lite_text_text_model", "sam3_lite_text"), - ("detr", "maskformer"), - ("maskformer-swin", "maskformer"), - ("roberta-prelayernorm", "roberta_prelayernorm"), - ("glm_image_vqmodel", "glm_image"), - ("glm_image_vision", "glm_image"), - ("glm_image_text", "glm_image"), - ("pix2struct_text_model", "pix2struct"), - ("pix2struct_vision_model", "pix2struct"), - ("clvp_encoder", "clvp"), - ("clvp_decoder", "clvp"), - ("idefics_vision", "idefics"), - ("idefics_perciever", "idefics"), - 
("gemma3n_text", "gemma3n"), ("gemma3n_audio", "gemma3n"), + ("gemma3n_text", "gemma3n"), ("gemma3n_vision", "gemma3n"), - ("florence_vision", "florence2"), - ("bert-generation", "bert_generation"), - ("clap_text_model", "clap"), - ("clap_audio_model", "clap"), - ("vision-text-dual-encoder", "vision_text_dual_encoder"), - ("dia_encoder", "dia"), - ("dia_decoder", "dia"), - ("phi4_multimodal_vision", "phi4_multimodal"), - ("phi4_multimodal_audio", "phi4_multimodal"), - ("metaclip_2_text_model", "metaclip_2"), - ("metaclip_2_vision_model", "metaclip_2"), - ("sam_hq_vision_model", "sam_hq"), - ("voxtral_realtime_text", "voxtral_realtime"), - ("voxtral_realtime_encoder", "voxtral_realtime"), + ("gemma4_audio", "gemma4"), + ("gemma4_text", "gemma4"), + ("gemma4_vision", "gemma4"), + ("git_vision_model", "git"), + ("glm4v_moe_text", "glm4v_moe"), + ("glm4v_moe_vision", "glm4v_moe"), + ("glm4v_text", "glm4v"), + ("glm4v_vision", "glm4v"), + ("glm_image_text", "glm_image"), + ("glm_image_vision", "glm_image"), + ("glm_image_vqmodel", "glm_image"), + ("glm_ocr_text", "glm_ocr"), + ("glm_ocr_vision", "glm_ocr"), + ("glmasr_encoder", "glmasr"), + ("granite_speech_encoder", "granite_speech"), + ("grounding-dino", "grounding_dino"), + ("groupvit_text_model", "groupvit"), + ("groupvit_vision_model", "groupvit"), + ("idefics2_perceiver", "idefics2"), + ("idefics2_vision", "idefics2"), + ("idefics3_vision", "idefics3"), + ("idefics_perciever", "idefics"), + ("idefics_vision", "idefics"), + ("instructblip_qformer", "instructblip"), + ("instructblip_vision_model", "instructblip"), + ("instructblipvideo_qformer", "instructblipvideo"), + ("instructblipvideo_vision_model", "instructblipvideo"), + ("internvl_vision", "internvl"), + ("janus_vision_model", "janus"), + ("janus_vqgan", "janus"), + ("kosmos-2", "kosmos2"), + ("kosmos-2.5", "kosmos2_5"), + ("kosmos_2_5_text_model", "kosmos2_5"), + ("kosmos_2_5_vision_model", "kosmos2_5"), ("kosmos_2_text_model", "kosmos2"), 
("kosmos_2_vision_model", "kosmos2"), - ("kosmos-2", "kosmos2"), - ("voxtral_encoder", "voxtral"), - ("grounding-dino", "grounding_dino"), - ("t5_gemma_module", "t5gemma"), - ("bridgetower_vision_model", "bridgetower"), - ("bridgetower_text_model", "bridgetower"), - ("qwen2_5_omni_vision_encoder", "qwen2_5_omni"), + ("lasr_ctc", "lasr"), + ("lasr_encoder", "lasr"), + ("llama4_text", "llama4"), + ("llama4_vision_model", "llama4"), + ("lw_detr_vit", "lw_detr"), + ("maskformer-swin", "maskformer"), + ("megatron-bert", "megatron_bert"), + ("metaclip_2_text_model", "metaclip_2"), + ("metaclip_2_vision_model", "metaclip_2"), + ("mgp-str", "mgp_str"), + ("mlcd_vision_model", "mlcd"), + ("mllama_text_model", "mllama"), + ("mllama_vision_model", "mllama"), + ("mm-grounding-dino", "mm_grounding_dino"), + ("modernbert-decoder", "modernbert_decoder"), + ("moonshine_streaming_encoder", "moonshine_streaming"), + ("moshi_depth", "moshi"), + ("musicgen_decoder", "musicgen"), + ("musicgen_melody_decoder", "musicgen_melody"), + ("nllb-moe", "nllb_moe"), + ("omdet-turbo", "omdet_turbo"), + ("openai-gpt", "openai"), + ("owlv2_text_model", "owlv2"), + ("owlv2_vision_model", "owlv2"), + ("owlvit_text_model", "owlvit"), + ("owlvit_vision_model", "owlvit"), + ("paddleocr_vl_text", "paddleocr_vl"), + ("paddleocr_vl_vision", "paddleocr_vl"), + ("parakeet_ctc", "parakeet"), + ("parakeet_encoder", "parakeet"), + ("pe_audio_encoder", "pe_audio"), + ("pe_audio_video_encoder", "pe_audio_video"), + ("pe_video_encoder", "pe_video"), + ("phi4_multimodal_audio", "phi4_multimodal"), + ("phi4_multimodal_vision", "phi4_multimodal"), + ("pix2struct_text_model", "pix2struct"), + ("pix2struct_vision_model", "pix2struct"), ("qwen2_5_omni_audio_encoder", "qwen2_5_omni"), + ("qwen2_5_omni_bigvgan", "qwen2_5_omni"), + ("qwen2_5_omni_dit", "qwen2_5_omni"), + ("qwen2_5_omni_talker", "qwen2_5_omni"), ("qwen2_5_omni_text", "qwen2_5_omni"), ("qwen2_5_omni_thinker", "qwen2_5_omni"), - ("qwen2_5_omni_talker", 
"qwen2_5_omni"), - ("qwen2_5_omni_dit", "qwen2_5_omni"), - ("qwen2_5_omni_bigvgan", "qwen2_5_omni"), ("qwen2_5_omni_token2wav", "qwen2_5_omni"), - ("speech-encoder-decoder", "speech_encoder_decoder"), - ("pe_audio_video_encoder", "pe_audio_video"), - ("kosmos_2_5_text_model", "kosmos2_5"), - ("kosmos_2_5_vision_model", "kosmos2_5"), - ("kosmos-2.5", "kosmos2_5"), - ("altclip_text_model", "altclip"), - ("altclip_vision_model", "altclip"), - ("speecht5_hifigan", "speecht5"), - ("pe_audio_encoder", "pe_audio"), - ("smolvlm_vision", "smolvlm"), - ("blt_local_encoder", "blt"), - ("blt_local_decoder", "blt"), - ("blt_global_transformer", "blt"), - ("blt_patcher", "blt"), - ("uvdoc_backbone", "uvdoc"), + ("qwen2_5_omni_vision_encoder", "qwen2_5_omni"), + ("qwen2_5_vl_text", "qwen2_5_vl"), + ("qwen2_5_vl_vision", "qwen2_5_vl"), ("qwen2_audio_encoder", "qwen2_audio"), - ("vision-encoder-decoder", "vision_encoder_decoder"), - ("edgetam_vision_model", "edgetam"), - ("qwen2_vl_vision", "qwen2_vl"), ("qwen2_vl_text", "qwen2_vl"), - ("omdet-turbo", "omdet_turbo"), - ("janus_vision_model", "janus"), - ("janus_vqgan", "janus"), - ("musicgen_decoder", "musicgen"), - ("sam_vision_model", "sam"), - ("data2vec-vision", "data2vec"), - ("data2vec-audio", "data2vec"), - ("data2vec-text", "data2vec"), - ("idefics2_vision", "idefics2"), - ("idefics2_perceiver", "idefics2"), - ("aria_text", "aria"), - ("qwen3_vl_moe_text", "qwen3_vl_moe"), - ("qwen3_vl_moe_vision", "qwen3_vl_moe"), - ("blenderbot-small", "blenderbot_small"), - ("ernie4_5_vl_moe_vision", "ernie4_5_vl_moe"), - ("ernie4_5_vl_moe_text", "ernie4_5_vl_moe"), - ("siglip_text_model", "siglip"), - ("siglip_vision_model", "siglip"), - ("flava_image_model", "flava"), - ("flava_text_model", "flava"), - ("flava_multimodal_model", "flava"), - ("modernbert-decoder", "modernbert_decoder"), - ("lasr_encoder", "lasr"), - ("lasr_ctc", "lasr"), - ("instructblip_vision_model", "instructblip"), - ("instructblip_qformer", "instructblip"), - 
("internvl_vision", "internvl"), + ("qwen2_vl_vision", "qwen2_vl"), + ("qwen3_5_moe_text", "qwen3_5_moe"), + ("qwen3_5_moe_vision", "qwen3_5_moe"), + ("qwen3_5_text", "qwen3_5"), + ("qwen3_5_vision", "qwen3_5"), ("qwen3_omni_moe_audio_encoder", "qwen3_omni_moe"), - ("qwen3_omni_moe_vision_encoder", "qwen3_omni_moe"), - ("qwen3_omni_moe_text", "qwen3_omni_moe"), - ("qwen3_omni_moe_thinker", "qwen3_omni_moe"), ("qwen3_omni_moe_talker_code_predictor", "qwen3_omni_moe"), ("qwen3_omni_moe_talker_text", "qwen3_omni_moe"), - ("xclip_text_model", "x_clip"), - ("xclip_vision_model", "x_clip"), - ("xclip", "x_clip"), - ("audioflamingo3_encoder", "audioflamingo3"), - ("wav2vec2-bert", "wav2vec2_bert"), - ("encoder-decoder", "encoder_decoder"), - ("instructblipvideo_vision_model", "instructblipvideo"), - ("instructblipvideo_qformer", "instructblipvideo"), - ("emu3_vqgan", "emu3"), - ("emu3_text_model", "emu3"), - ("chinese_clip_text_model", "chinese_clip"), - ("chinese_clip_vision_model", "chinese_clip"), - ("owlv2_text_model", "owlv2"), - ("owlv2_vision_model", "owlv2"), - ("qwen3_5_moe_text", "qwen3_5_moe"), - ("qwen3_5_moe_vision", "qwen3_5_moe"), - ("lw_detr_vit", "lw_detr"), - ("donut-swin", "donut"), - ("moonshine_streaming_encoder", "moonshine_streaming"), - ("owlvit_text_model", "owlvit"), - ("owlvit_vision_model", "owlvit"), - ("glm_ocr_vision", "glm_ocr"), - ("glm_ocr_text", "glm_ocr"), - ("xlm-roberta", "xlm_roberta"), - ("unispeech-sat", "unispeech_sat"), - ("wav2vec2-conformer", "wav2vec2_conformer"), - ("blip_text_model", "blip"), - ("blip_vision_model", "blip"), - ("sam3_geometry_encoder", "sam3"), - ("sam3_detr_encoder", "sam3"), - ("sam3_detr_decoder", "sam3"), - ("sam3_mask_decoder", "sam3"), - ("granite_speech_encoder", "granite_speech"), - ("deberta-v2", "deberta_v2"), + ("qwen3_omni_moe_text", "qwen3_omni_moe"), + ("qwen3_omni_moe_thinker", "qwen3_omni_moe"), + ("qwen3_omni_moe_vision_encoder", "qwen3_omni_moe"), + ("qwen3_vl_moe_text", "qwen3_vl_moe"), + 
("qwen3_vl_moe_vision", "qwen3_vl_moe"), + ("qwen3_vl_text", "qwen3_vl"), + ("qwen3_vl_vision", "qwen3_vl"), + ("roberta-prelayernorm", "roberta_prelayernorm"), + ("rt_detr_resnet", "rt_detr"), ("sam2_hiera_det_model", "sam2"), ("sam2_vision_model", "sam2"), - ("openai-gpt", "openai"), - ("csm_depth_decoder_model", "csm"), - ("align_text_model", "align"), - ("align_vision_model", "align"), - ("groupvit_text_model", "groupvit"), - ("groupvit_vision_model", "groupvit"), - ("mgp-str", "mgp_str"), - ("mm-grounding-dino", "mm_grounding_dino"), - ("git_vision_model", "git"), - ("musicgen_melody_decoder", "musicgen_melody"), - ("mllama_vision_model", "mllama"), - ("mllama_text_model", "mllama"), - ("mlcd_vision_model", "mlcd"), + ("sam3_detr_decoder", "sam3"), + ("sam3_detr_encoder", "sam3"), + ("sam3_geometry_encoder", "sam3"), + ("sam3_lite_text_detr_decoder", "sam3_lite_text"), + ("sam3_lite_text_detr_encoder", "sam3_lite_text"), + ("sam3_lite_text_geometry_encoder", "sam3_lite_text"), + ("sam3_lite_text_mask_decoder", "sam3_lite_text"), + ("sam3_lite_text_text_model", "sam3_lite_text"), + ("sam3_mask_decoder", "sam3"), + ("sam3_vision_model", "sam3"), + ("sam3_vit_model", "sam3"), + ("sam_hq_vision_model", "sam_hq"), + ("sam_vision_model", "sam"), ("sew-d", "sew_d"), - ("video_llama_3_vision", "video_llama_3"), - ("idefics3_vision", "idefics3"), - ("gemma4_audio", "gemma4"), - ("gemma4_text", "gemma4"), - ("gemma4_vision", "gemma4"), - ("qwen3_vl_vision", "qwen3_vl"), - ("qwen3_vl_text", "qwen3_vl"), - ("blip_2_vision_model", "blip_2"), - ("blip_2_qformer", "blip_2"), - ("blip-2", "blip_2"), - ("parakeet_encoder", "parakeet"), - ("parakeet_ctc", "parakeet"), + ("siglip2_text_model", "siglip2"), + ("siglip2_vision_model", "siglip2"), + ("siglip_text_model", "siglip"), + ("siglip_vision_model", "siglip"), + ("smolvlm_vision", "smolvlm"), + ("speech-encoder-decoder", "speech_encoder_decoder"), + ("speecht5_hifigan", "speecht5"), + ("t5_gemma_module", "t5gemma"), + 
("t5gemma2_decoder", "t5gemma2"), + ("t5gemma2_encoder", "t5gemma2"), + ("t5gemma2_text", "t5gemma2"), ("table-transformer", "table_transformer"), - ("glm4v_moe_text", "glm4v_moe"), - ("glm4v_moe_vision", "glm4v_moe"), - ("paddleocr_vl_vision", "paddleocr_vl"), - ("paddleocr_vl_text", "paddleocr_vl"), + ("unispeech-sat", "unispeech_sat"), + ("uvdoc_backbone", "uvdoc"), + ("video_llama_3_vision", "video_llama_3"), + ("vision-encoder-decoder", "vision_encoder_decoder"), + ("vision-text-dual-encoder", "vision_text_dual_encoder"), + ("voxtral_encoder", "voxtral"), + ("voxtral_realtime_encoder", "voxtral_realtime"), + ("voxtral_realtime_text", "voxtral_realtime"), + ("wav2vec2-bert", "wav2vec2_bert"), + ("wav2vec2-conformer", "wav2vec2_conformer"), + ("xclip", "x_clip"), + ("xclip_text_model", "x_clip"), + ("xclip_vision_model", "x_clip"), + ("xlm-roberta", "xlm_roberta"), + ("xlm-roberta-xl", "xlm_roberta_xl"), ] ) diff --git a/utils/check_auto.py b/utils/check_auto.py index a7549969cd8f..fd0d27452e01 100644 --- a/utils/check_auto.py +++ b/utils/check_auto.py @@ -32,8 +32,8 @@ "name": "auto_mappings", "label": "Generate auto mappings", "file_globs": [], - "check_args": ["--fix_and_overwrite"], - "fix_args": [], + "check_args": [], + "fix_args": ["--fix_and_overwrite"], } AUTO_GENERATED_HADER = """# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 diff --git a/utils/sort_auto_mappings.py b/utils/sort_auto_mappings.py index cccae472a0a4..655f056eb376 100644 --- a/utils/sort_auto_mappings.py +++ b/utils/sort_auto_mappings.py @@ -46,9 +46,8 @@ PATH_TO_AUTO_MODULE = "src/transformers/models/auto" -# re pattern that matches mapping introductions: -# SUPER_MODEL_MAPPING_NAMES = OrderedDict or SUPER_MODEL_MAPPING = OrderedDict -_re_intro_mapping = re.compile(r"[A-Z_]+_MAPPING(\s+|_[A-Z_]+\s+)=\s+OrderedDict(?!\(\*)") +# re pattern that matches XXX_MAPPING_NAMES or SPECIAL_MODEL_TYPE_TO_MODULE_NAME +_re_intro_mapping = 
re.compile(r"[A-Z_]+(_MAPPING|_MODEL_TYPE_TO_MODULE)(\s+|_[A-Z_]+\s+)=\s+OrderedDict(?!\(\*)") # re pattern that matches identifiers in mappings _re_identifier = re.compile(r'\s*\(\s*"(\S[^"]+)"')