diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py index 45c8c97c76eb..d24d32494c4f 100644 --- a/src/diffusers/loaders/__init__.py +++ b/src/diffusers/loaders/__init__.py @@ -60,7 +60,7 @@ def text_encoder_attn_modules(text_encoder): if is_transformers_available(): _import_structure["single_file"].extend(["FromSingleFileMixin"]) - _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"] + _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin", "ControlLoRAMixin"] _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"] _import_structure["ip_adapter"] = ["IPAdapterMixin"] @@ -73,7 +73,7 @@ def text_encoder_attn_modules(text_encoder): if is_transformers_available(): from .ip_adapter import IPAdapterMixin - from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin + from .lora import ControlLoRAMixin, LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin from .single_file import FromSingleFileMixin from .textual_inversion import TextualInversionLoaderMixin else: diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 06eb3af05ee2..2567c74a2266 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -179,6 +179,7 @@ def load_lora_weights( def lora_state_dict( cls, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + controlnet: bool = False, **kwargs, ): r""" @@ -310,20 +311,21 @@ def lora_state_dict( network_alphas = None # TODO: replace it with a method from `state_dict_utils` - if all( - ( - k.startswith("lora_te_") - or k.startswith("lora_unet_") - or k.startswith("lora_te1_") - or k.startswith("lora_te2_") - ) - for k in state_dict.keys() - ): - # Map SDXL blocks correctly. - if unet_config is not None: - # use unet config to remap block numbers - state_dict = cls._maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) - state_dict, network_alphas = cls._convert_kohya_lora_to_diffusers(state_dict) + if not controlnet: + if all( + ( + k.startswith("lora_te_") + or k.startswith("lora_unet_") + or k.startswith("lora_te1_") + or k.startswith("lora_te2_") + ) + for k in state_dict.keys() + ): + # Map SDXL blocks correctly. + if unet_config is not None: + # use unet config to remap block numbers + state_dict = cls._maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) + state_dict, network_alphas = cls._convert_kohya_lora_to_diffusers(state_dict) return state_dict, network_alphas @@ -1867,3 +1869,107 @@ def _remove_text_encoder_monkey_patch(self): else: self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) + + +class ControlLoRAMixin(LoraLoaderMixin): + # Simplified loading of ControlNet LoRA (Control-LoRA) checkpoints. + def load_lora_weights(self, pretrained_model_name_or_path_or_dict, **kwargs): + from collections import defaultdict + from ..models.lora import LoRACompatibleConv, LoRACompatibleLinear, LoRAConv2dLayer, LoRALinearLayer + from ..pipelines.stable_diffusion.convert_from_ckpt import convert_ldm_unet_checkpoint + + controlnet_config = kwargs.pop("controlnet_config", None) + if controlnet_config is None: + raise ValueError("Must provide a `controlnet_config`.") + + state_dict, _ = self.lora_state_dict(pretrained_model_name_or_path_or_dict, controlnet=True, **kwargs) + + # A ControlNet LoRA state dict contains a mix of parameters: some belong to LoRA layers, while + # the rest belong to the original state_dict (initialized from the underlying UNet).
+ # So, we first remap all keys to the diffusers format and load the regular parameters into + # the ControlNet; the remaining LoRA parameters are handled separately below. + converted_state_dict = convert_ldm_unet_checkpoint( + state_dict, controlnet=True, config=controlnet_config, skip_extract_state_dict=True, controlnet_lora=True + ) + + # Load whatever matches directly. + load_state_dict_results = self.load_state_dict(converted_state_dict, strict=False) + if not all("lora" in k for k in load_state_dict_results.unexpected_keys): + raise ValueError( + f"The unexpected keys must only belong to LoRA parameters at this point, but found the following non-LoRA keys:\n{load_state_dict_results.unexpected_keys}" + ) + + # Filter out the rest of the state_dict for handling LoRA. + remaining_state_dict = { + k: v for k, v in converted_state_dict.items() if k in load_state_dict_results.unexpected_keys + } + + # Handle LoRA. + lora_grouped_dict = defaultdict(dict) + lora_layers_list = [] + + all_keys = list(remaining_state_dict.keys()) + for key in all_keys: + value = remaining_state_dict.pop(key) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + if len(remaining_state_dict) > 0: + raise ValueError( + f"The `remaining_state_dict` has to be empty at this point but has the following keys:\n\n{', '.join(remaining_state_dict.keys())}" + ) + + for key, value_dict in lora_grouped_dict.items(): + attn_processor = self + for sub_key in key.split("."): + attn_processor = getattr(attn_processor, sub_key) + + # These are plain LoRA-compatible conv/linear modules rather than attention processors, so + # they don't have to_{k,v,q,out_proj}_lora or add_{k,v,q,out_proj}_proj_lora sublayers. + rank = value_dict["lora.down.weight"].shape[0] + + if isinstance(attn_processor, LoRACompatibleConv): + in_features = attn_processor.in_channels + out_features = attn_processor.out_channels + kernel_size = attn_processor.kernel_size + + lora = LoRAConv2dLayer( + in_features=in_features, + out_features=out_features, + rank=rank, + kernel_size=kernel_size, + stride=attn_processor.stride, + padding=attn_processor.padding, + # initial_weight=attn_processor.weight, + # initial_bias=attn_processor.bias, + ) + elif isinstance(attn_processor, LoRACompatibleLinear): + lora = LoRALinearLayer( + attn_processor.in_features, + attn_processor.out_features, + rank, + # initial_weight=attn_processor.weight, + # initial_bias=attn_processor.bias, + ) + else: + raise ValueError(f"Module {key} is not a LoRACompatibleConv or LoRACompatibleLinear module.") + + value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()} + load_state_dict_results = lora.load_state_dict(value_dict, strict=False) + if not all("initial" in k for k in load_state_dict_results.unexpected_keys): + raise ValueError(f"Unexpected keys found in the `value_dict` for the LoRA layer {key}.") + lora_layers_list.append((attn_processor, lora)) + + # set correct dtype & device + lora_layers_list = [(t, l.to(device=self.device, dtype=self.dtype)) for t, l in lora_layers_list] + + # set lora layers + for target_module, lora_layer in lora_layers_list: + target_module.set_lora_layer(lora_layer) + + def unload_lora_weights(self): + for _, module in self.named_modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + # TODO (sayakpaul): implement `fuse_lora()` and `unfuse_lora()`.
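Reviewer note: a minimal end-to-end usage sketch of the new `ControlLoRAMixin` (not part of the diff). The checkpoint path is a placeholder, SDXL is used only for illustration, and passing the ControlNet's own config as `controlnet_config` is an assumption about the diffusers-format config dict that `convert_ldm_unet_checkpoint` expects:

```python
import torch
from diffusers import ControlNetModel, StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)

# ControlNet LoRA checkpoints ship only the LoRA parameters plus a handful of full
# parameters, so the ControlNet itself is initialized from the base UNet.
controlnet = ControlNetModel.from_unet(pipe.unet)

# `load_lora_weights` comes from `ControlLoRAMixin`; `controlnet_config` is forwarded
# to `convert_ldm_unet_checkpoint` for key remapping (assumed shape of the config).
controlnet.load_lora_weights(
    "path/to/control-lora.safetensors",  # placeholder checkpoint path
    controlnet_config=dict(controlnet.config),
)

# The LoRA layers can be detached again at any point:
controlnet.unload_lora_weights()
```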
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 220e34593c23..cafa6d4832dd 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -20,6 +20,7 @@ from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import FromOriginalControlnetMixin +from ..loaders import ControlLoRAMixin, FromOriginalControlnetMixin, UNet2DConditionLoadersMixin +from ..models.lora import LoRACompatibleConv from ..utils import BaseOutput, logging from .attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, @@ -80,7 +82,7 @@ def __init__( ): super().__init__() - self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + self.conv_in = LoRACompatibleConv(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) self.blocks = nn.ModuleList([]) @@ -106,8 +107,10 @@ def forward(self, conditioning): return embedding -class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): +class ControlNetModel( + ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, FromOriginalControlnetMixin, ControlLoRAMixin +): """ A ControlNet model. @@ -250,7 +253,7 @@ def __init__( # input conv_in_kernel = 3 conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2d( + self.conv_in = LoRACompatibleConv( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding ) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index a377ae267411..d5c672083707 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -200,10 +200,10 @@ def __init__( super().__init__() linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear - self.linear_1 = linear_cls(in_channels, time_embed_dim) + self.linear_1 = LoRACompatibleLinear(in_channels, time_embed_dim) if cond_proj_dim is not None: - self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + self.cond_proj = LoRACompatibleLinear(cond_proj_dim, in_channels, bias=False) else: self.cond_proj = None @@ -213,7 +213,7 @@ def __init__( time_embed_dim_out = out_dim else: time_embed_dim_out = time_embed_dim - self.linear_2 = linear_cls(time_embed_dim, time_embed_dim_out) + self.linear_2 = LoRACompatibleLinear(time_embed_dim, time_embed_dim_out) if post_act_fn is None: self.post_act = None diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 35466f008f54..9bfbab0c7d7b 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -304,6 +304,8 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): class_embed_type = "projection" assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels + else: + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = { "sample_size": image_size // vae_scale_factor, @@ -321,12 +323,6 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): "transformer_layers_per_block": transformer_layers_per_block, } - if "disable_self_attentions" in
unet_params: - config["only_cross_attention"] = unet_params.disable_self_attentions - - if "num_classes" in unet_params and isinstance(unet_params.num_classes, int): - config["num_class_embeds"] = unet_params.num_classes - if controlnet: config["conditioning_channels"] = unet_params.hint_channels else: @@ -381,11 +377,21 @@ def create_ldm_bert_config(original_config): def convert_ldm_unet_checkpoint( - checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False + checkpoint, + config, + path=None, + extract_ema=False, + controlnet=False, + skip_extract_state_dict=False, + controlnet_lora=False, ): """ Takes a state dict and a config, and returns a converted checkpoint. """ + if not controlnet and controlnet_lora: + raise ValueError(f"`controlnet_lora=True` requires `controlnet=True`, but got `controlnet={controlnet}`.") + if controlnet_lora: + skip_extract_state_dict = True if skip_extract_state_dict: unet_state_dict = checkpoint @@ -423,10 +429,22 @@ def convert_ldm_unet_checkpoint( new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + if controlnet_lora: + # Safe to pop: the "lora_controlnet" key is only a marker and holds no parameters. + _ = unet_state_dict.pop("lora_controlnet") + + if not controlnet_lora: + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + else: + new_checkpoint["time_embedding.linear_1.lora_down.weight"] = unet_state_dict["time_embed.0.down"] + new_checkpoint["time_embedding.linear_1.lora_up.weight"] = unet_state_dict["time_embed.0.up"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.lora_down.weight"] = unet_state_dict["time_embed.2.down"] + new_checkpoint["time_embedding.linear_2.lora_up.weight"] = unet_state_dict["time_embed.2.up"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port @@ -440,17 +458,26 @@ def convert_ldm_unet_checkpoint( raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") if config["addition_embed_type"] == "text_time": - new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] - new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] - new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] - new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] - - # Relevant to StableDiffusionUpscalePipeline - if "num_class_embeds" in config: - new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + if not controlnet_lora: + new_checkpoint["add_embedding.linear_1.weight"] = 
unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + else: + new_checkpoint["add_embedding.linear_1.lora_down.weight"] = unet_state_dict["label_emb.0.0.down"] + new_checkpoint["add_embedding.linear_1.lora_up.weight"] = unet_state_dict["label_emb.0.0.up"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.lora_down.weight"] = unet_state_dict["label_emb.0.2.down"] + new_checkpoint["add_embedding.linear_2.lora_up.weight"] = unet_state_dict["label_emb.0.2.up"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + + if not controlnet_lora: + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + else: + new_checkpoint["conv_in.lora_down.weight"] = unet_state_dict["input_blocks.0.0.down"] + new_checkpoint["conv_in.lora_up.weight"] = unet_state_dict["input_blocks.0.0.up"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -597,8 +623,9 @@ orig_index += 2 diffusers_index = 0 + diffusers_index_limit = 6 - while diffusers_index < 6: + while diffusers_index < diffusers_index_limit: new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( f"input_hint_block.{orig_index}.weight" ) @@ -608,12 +635,13 @@ diffusers_index += 1 orig_index += 2 - new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) + if not controlnet_lora: + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): @@ -624,6 +652,21 @@ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") + if controlnet_lora: + modified_new_checkpoint = {} + down_pattern = r"\.down$" + up_pattern = r"\.up$" + + for key in new_checkpoint: + new_key = key + new_key = re.sub(down_pattern, ".lora.down.weight", new_key) + new_key = re.sub(up_pattern, ".lora.up.weight", new_key) + new_key = new_key.replace("lora_down", "lora.down") + new_key = new_key.replace("lora_up", "lora.up") + modified_new_checkpoint[new_key] = new_checkpoint[key] + + new_checkpoint = modified_new_checkpoint + return new_checkpoint @@ -787,12 +830,7 @@ def 
_copy_layers(hf_layers, pt_layers): def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None): if text_encoder is None: config_name = "openai/clip-vit-large-patch14" - try: - config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'." - ) + config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): @@ -927,12 +965,7 @@ def convert_open_clip_checkpoint( # text_model = CLIPTextModelWithProjection.from_pretrained( # "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280 # ) - try: - config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'." - ) + config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): @@ -1145,7 +1178,6 @@ def download_from_original_stable_diffusion_ckpt( stable_unclip_prior: Optional[str] = None, clip_stats_path: Optional[str] = None, controlnet: Optional[bool] = None, - adapter: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, local_files_only=False, @@ -1230,13 +1262,14 @@ def download_from_original_stable_diffusion_ckpt( StableDiffusionControlNetPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, - StableDiffusionUpscalePipeline, StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, ) + if pipeline_class is None: + pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline + if prediction_type == "v-prediction": prediction_type = "v_prediction" @@ -1275,43 +1308,25 @@ def download_from_original_stable_diffusion_ckpt( key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias" key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias" - is_upscale = pipeline_class == StableDiffusionUpscalePipeline - - config_url = None # model_type = "v1" - if config_files is not None and "v1" in config_files: - original_config_file = config_files["v1"] - else: - config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024: # model_type = "v2" - if config_files is not None and "v2" in config_files: - original_config_file = config_files["v2"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + if global_step == 110000: # v2.1 needs to upcast attention upcast_attention = True 
elif key_name_sd_xl_base in checkpoint: # only base xl has two text embedders - if config_files is not None and "xl" in config_files: - original_config_file = config_files["xl"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" elif key_name_sd_xl_refiner in checkpoint: # only refiner xl has embedder and one text embedders - if config_files is not None and "xl_refiner" in config_files: - original_config_file = config_files["xl_refiner"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" - if is_upscale: - config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml" - - if config_url is not None: - original_config_file = BytesIO(requests.get(config_url).content) + original_config_file = BytesIO(requests.get(config_url).content) original_config = OmegaConf.load(original_config_file) @@ -1331,17 +1346,8 @@ def download_from_original_stable_diffusion_ckpt( if image_size is None: image_size = 1024 - if pipeline_class is None: - # Check if we have a SDXL or SD model and initialize default pipeline - if model_type not in ["SDXL", "SDXL-Refiner"]: - pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline - else: - pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline - if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline: num_in_channels = 9 - if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline: - num_in_channels = 7 elif num_in_channels is None: num_in_channels = 4 @@ -1425,13 +1431,9 @@ def download_from_original_stable_diffusion_ckpt( else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - if pipeline_class == StableDiffusionUpscalePipeline: - image_size = original_config.model.params.unet_config.params.image_size - # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention - path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=path, extract_ema=extract_ema ) @@ -1480,19 +1482,11 @@ def download_from_original_stable_diffusion_ckpt( config_name = "stabilityai/stable-diffusion-2" config_kwargs = {"subfolder": "text_encoder"} - text_model = convert_open_clip_checkpoint( - checkpoint, config_name, local_files_only=local_files_only, **config_kwargs + text_model = convert_open_clip_checkpoint(checkpoint, config_name, local_files_only=local_files_only, **config_kwargs) + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only ) - try: - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'." 
- ) - if stable_unclip is None: if controlnet: pipe = pipeline_class( @@ -1504,29 +1498,8 @@ def download_from_original_stable_diffusion_ckpt( controlnet=controlnet, safety_checker=None, feature_extractor=None, + requires_safety_checker=False, ) - if hasattr(pipe, "requires_safety_checker"): - pipe.requires_safety_checker = False - - elif pipeline_class == StableDiffusionUpscalePipeline: - scheduler = DDIMScheduler.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler" - ) - low_res_scheduler = DDPMScheduler.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler" - ) - - pipe = pipeline_class( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - low_res_scheduler=low_res_scheduler, - safety_checker=None, - feature_extractor=None, - ) - else: pipe = pipeline_class( vae=vae, @@ -1536,10 +1509,8 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, + requires_safety_checker=False, ) - if hasattr(pipe, "requires_safety_checker"): - pipe.requires_safety_checker = False - else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1570,14 +1541,9 @@ def download_from_original_stable_diffusion_ckpt( karlo_model, subfolder="prior", local_files_only=local_files_only ) - try: - prior_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + prior_tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) prior_text_model = CLIPTextModelWithProjection.from_pretrained( "openai/clip-vit-large-patch14", local_files_only=local_files_only ) @@ -1610,22 +1576,10 @@ def download_from_original_stable_diffusion_ckpt( raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") elif model_type == "PaintByExample": vision_model = convert_paint_by_example_checkpoint(checkpoint) - try: - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) - try: - feature_extractor = AutoFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'." 
- ) + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + feature_extractor = AutoFeatureExtractor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) pipe = PaintByExamplePipeline( vae=vae, image_encoder=vision_model, @@ -1638,16 +1592,11 @@ def download_from_original_stable_diffusion_ckpt( text_model = convert_ldm_clip_checkpoint( checkpoint, local_files_only=local_files_only, text_encoder=text_encoder ) - try: - tokenizer = ( - CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) - if tokenizer is None - else tokenizer - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + tokenizer = ( + CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + if tokenizer is None + else tokenizer + ) if load_safety_checker: safety_checker = StableDiffusionSafetyChecker.from_pretrained( @@ -1683,33 +1632,18 @@ def download_from_original_stable_diffusion_ckpt( ) elif model_type in ["SDXL", "SDXL-Refiner"]: if model_type == "SDXL": - try: - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) + tokenizer_2 = CLIPTokenizer.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only + ) config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.1.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, + checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, local_files_only=local_files_only, **config_kwargs ) if is_accelerate_available(): # SBM Now move model to cpu. 
@@ -1729,18 +1663,6 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, force_zeros_for_empty_prompt=True, ) - elif adapter: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - adapter=adapter, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) else: pipe = pipeline_class( vae=vae, @@ -1755,23 +1677,14 @@ def download_from_original_stable_diffusion_ckpt( else: tokenizer = None text_encoder = None - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) + tokenizer_2 = CLIPTokenizer.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only + ) + config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.0.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, + checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, local_files_only=local_files_only, **config_kwargs ) if is_accelerate_available(): # SBM Now move model to cpu.
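Reviewer note: the final key-remapping pass added to `convert_ldm_unet_checkpoint` is easy to misread, so here is a standalone sketch of just that step; the sample keys are illustrative, not taken from a real checkpoint:

```python
import re

def remap_control_lora_keys(checkpoint: dict) -> dict:
    # Re-implementation of only the final rename pass above, for illustration.
    remapped = {}
    for key, value in checkpoint.items():
        new_key = re.sub(r"\.down$", ".lora.down.weight", key)  # bare ".down" suffixes
        new_key = re.sub(r"\.up$", ".lora.up.weight", new_key)  # bare ".up" suffixes
        new_key = new_key.replace("lora_down", "lora.down")     # explicitly set lora_down keys
        new_key = new_key.replace("lora_up", "lora.up")
        remapped[new_key] = value
    return remapped

sample = {
    "time_embedding.linear_1.lora_down.weight": None,  # set explicitly for controlnet_lora
    "down_blocks.0.resnets.0.conv1.down": None,        # bulk-mapped key with a bare suffix
}
print(list(remap_control_lora_keys(sample)))
# ['time_embedding.linear_1.lora.down.weight', 'down_blocks.0.resnets.0.conv1.lora.down.weight']
```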