From 33441a8ab1f055e285479abcecbe207f4440cb77 Mon Sep 17 00:00:00 2001 From: Teriks Date: Tue, 11 Feb 2025 14:39:42 -0600 Subject: [PATCH 1/3] Fix SD2.X clip single file load projection_dim Infer projection_dim from the checkpoint before loading from pretrained, overriding any incorrect hub config. The hub configuration for SD2.X specifies projection_dim=512, which is incorrect for SD2.X checkpoints loaded from CivitAI and similar sources. Previously, an exception was thrown when load_model_dict_into_meta was called for SD2.X single file checkpoints. Such LDM models usually require projection_dim=1024. --- src/diffusers/loaders/single_file_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 59060efade8b..3d04f0d30d77 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -1545,6 +1545,14 @@ def create_diffusers_clip_model_from_ldm( config["pretrained_model_name_or_path"] = clip_config subfolder = "" + if is_open_clip_model(checkpoint): + # infer projection_dim for the text_encoder using the checkpoint. + # should fix SD2.X LDM checkpoint loads from CivitAI and similar. 
+ # The configuration on the hub is often (or always) incorrect for these models + # which need projection_dim=1024 and not projection_dim=512 + if 'cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight' in checkpoint: + config['projection_dim'] = checkpoint['cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight'].shape[0] + model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): From d25c76a5cfa4a9d45f6976791a545896e4131858 Mon Sep 17 00:00:00 2001 From: Teriks Date: Mon, 3 Mar 2025 01:38:54 -0600 Subject: [PATCH 2/3] convert_open_clip_checkpoint use hidden_size for text_proj_dim --- src/diffusers/loaders/single_file_utils.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 3d04f0d30d77..aafc15ecc617 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -1447,9 +1447,9 @@ def convert_open_clip_checkpoint( text_proj_key = prefix + "text_projection" if text_proj_key in checkpoint: - text_proj_dim = int(checkpoint[text_proj_key].shape[0]) - elif hasattr(text_model.config, "projection_dim"): - text_proj_dim = text_model.config.projection_dim + text_proj_dim = int(checkpoint[text_proj_key].shape[1]) + elif hasattr(text_model.config, "hidden_size"): + text_proj_dim = text_model.config.hidden_size else: text_proj_dim = LDM_OPEN_CLIP_TEXT_PROJECTION_DIM @@ -1545,14 +1545,6 @@ def create_diffusers_clip_model_from_ldm( config["pretrained_model_name_or_path"] = clip_config subfolder = "" - if is_open_clip_model(checkpoint): - # infer projection_dim for the text_encoder using the checkpoint. - # should fix SD2.X LDM checkpoint loads from CivitAI and similar. 
- # The configuration on the hub is often (or always) incorrect for these models - # which need projection_dim=1024 and not projection_dim=512 - if 'cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight' in checkpoint: - config['projection_dim'] = checkpoint['cond_stage_model.model.transformer.resblocks.0.mlp.c_proj.weight'].shape[0] - model_config = cls.config_class.from_pretrained(**config, subfolder=subfolder, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): From c88b6c1e2159553f23eacb24a0fcbb2af2a2e3f7 Mon Sep 17 00:00:00 2001 From: Teriks Date: Mon, 3 Mar 2025 05:45:23 -0600 Subject: [PATCH 3/3] convert_open_clip_checkpoint: revert checkpoint[text_proj_key].shape[1] -> [0]; values are identical --- src/diffusers/loaders/single_file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index aafc15ecc617..cc421d0291d9 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -1447,7 +1447,7 @@ def convert_open_clip_checkpoint( text_proj_key = prefix + "text_projection" if text_proj_key in checkpoint: - text_proj_dim = int(checkpoint[text_proj_key].shape[1]) + text_proj_dim = int(checkpoint[text_proj_key].shape[0]) elif hasattr(text_model.config, "hidden_size"): text_proj_dim = text_model.config.hidden_size else: