diff --git a/src/diffusers/loaders/__init__.py b/src/diffusers/loaders/__init__.py index 45c8c97c76eb..d24d32494c4f 100644 --- a/src/diffusers/loaders/__init__.py +++ b/src/diffusers/loaders/__init__.py @@ -60,7 +60,7 @@ def text_encoder_attn_modules(text_encoder): if is_transformers_available(): _import_structure["single_file"].extend(["FromSingleFileMixin"]) - _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"] + _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin", "ControlLoRAMixin"] _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"] _import_structure["ip_adapter"] = ["IPAdapterMixin"] @@ -73,7 +73,7 @@ def text_encoder_attn_modules(text_encoder): if is_transformers_available(): from .ip_adapter import IPAdapterMixin - from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin + from .lora import ControlLoRAMixin, LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin from .single_file import FromSingleFileMixin from .textual_inversion import TextualInversionLoaderMixin else: diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 06eb3af05ee2..2567c74a2266 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -179,6 +179,7 @@ def load_lora_weights( def lora_state_dict( cls, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + controlnet: bool = False, **kwargs, ): r""" @@ -310,20 +311,21 @@ def lora_state_dict( network_alphas = None # TODO: replace it with a method from `state_dict_utils` - if all( - ( - k.startswith("lora_te_") - or k.startswith("lora_unet_") - or k.startswith("lora_te1_") - or k.startswith("lora_te2_") - ) - for k in state_dict.keys() - ): - # Map SDXL blocks correctly. - if unet_config is not None: - # use unet config to remap block numbers - state_dict = cls._maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) - state_dict, network_alphas = cls._convert_kohya_lora_to_diffusers(state_dict) + if not controlnet: + if all( + ( + k.startswith("lora_te_") + or k.startswith("lora_unet_") + or k.startswith("lora_te1_") + or k.startswith("lora_te2_") + ) + for k in state_dict.keys() + ): + # Map SDXL blocks correctly. + if unet_config is not None: + # use unet config to remap block numbers + state_dict = cls._maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) + state_dict, network_alphas = cls._convert_kohya_lora_to_diffusers(state_dict) return state_dict, network_alphas @@ -1867,3 +1869,107 @@ def _remove_text_encoder_monkey_patch(self): else: self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder) self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2) + + +class ControlLoRAMixin(LoraLoaderMixin): + # Simplified loading of ControlNet LoRA (Control-LoRA) checkpoints. + def load_lora_weights(self, pretrained_model_name_or_path_or_dict, **kwargs): + from collections import defaultdict + from ..models.lora import LoRACompatibleConv, LoRACompatibleLinear, LoRAConv2dLayer, LoRALinearLayer + from ..pipelines.stable_diffusion.convert_from_ckpt import convert_ldm_unet_checkpoint + + controlnet_config = kwargs.pop("controlnet_config", None) + if controlnet_config is None: + raise ValueError("Must provide a `controlnet_config`.") + + state_dict, _ = self.lora_state_dict(pretrained_model_name_or_path_or_dict, controlnet=True, **kwargs) + + # A ControlNet LoRA state dict contains a mix of parameters: some belong to LoRA layers, while + # the rest belong to the original state_dict (initialized from the underlying UNet).
+ # So, we first remap all keys to the diffusers format and load the regular parameters into + # the ControlNet; the remaining LoRA parameters are handled separately below. + converted_state_dict = convert_ldm_unet_checkpoint( + state_dict, controlnet=True, config=controlnet_config, skip_extract_state_dict=True, controlnet_lora=True + ) + + # Load whatever matches directly. + load_state_dict_results = self.load_state_dict(converted_state_dict, strict=False) + if not all("lora" in k for k in load_state_dict_results.unexpected_keys): + raise ValueError( + f"The unexpected keys must only belong to LoRA parameters at this point, but found the following non-LoRA keys:\n{load_state_dict_results.unexpected_keys}" + ) + + # Filter out the rest of the state_dict for handling LoRA. + remaining_state_dict = { + k: v for k, v in converted_state_dict.items() if k in load_state_dict_results.unexpected_keys + } + + # Handle LoRA. + lora_grouped_dict = defaultdict(dict) + lora_layers_list = [] + + all_keys = list(remaining_state_dict.keys()) + for key in all_keys: + value = remaining_state_dict.pop(key) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + if len(remaining_state_dict) > 0: + raise ValueError( + f"The `remaining_state_dict` has to be empty at this point but has the following keys:\n\n{', '.join(remaining_state_dict.keys())}" + ) + + for key, value_dict in lora_grouped_dict.items(): + attn_processor = self + for sub_key in key.split("."): + attn_processor = getattr(attn_processor, sub_key) + + # These are plain LoRA-compatible conv/linear modules rather than attention processors, so + # they don't have to_{k,v,q,out_proj}_lora or add_{k,v,q,out_proj}_proj_lora sublayers. + rank = value_dict["lora.down.weight"].shape[0] + + if isinstance(attn_processor, LoRACompatibleConv): + in_features = attn_processor.in_channels + out_features = attn_processor.out_channels + kernel_size = attn_processor.kernel_size + + lora = LoRAConv2dLayer( + in_features=in_features, + out_features=out_features, + rank=rank, + kernel_size=kernel_size, + stride=attn_processor.stride, + padding=attn_processor.padding, + # initial_weight=attn_processor.weight, + # initial_bias=attn_processor.bias, + ) + elif isinstance(attn_processor, LoRACompatibleLinear): + lora = LoRALinearLayer( + attn_processor.in_features, + attn_processor.out_features, + rank, + # initial_weight=attn_processor.weight, + # initial_bias=attn_processor.bias, + ) + else: + raise ValueError(f"Module {key} is not a LoRACompatibleConv or LoRACompatibleLinear module.") + + value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()} + load_state_dict_results = lora.load_state_dict(value_dict, strict=False) + if not all("initial" in k for k in load_state_dict_results.unexpected_keys): + raise ValueError(f"Unexpected keys found in the `value_dict` for the LoRA layer {key}.") + lora_layers_list.append((attn_processor, lora)) + + # set correct dtype & device + lora_layers_list = [(t, l.to(device=self.device, dtype=self.dtype)) for t, l in lora_layers_list] + + # set lora layers + for target_module, lora_layer in lora_layers_list: + target_module.set_lora_layer(lora_layer) + + def unload_lora_weights(self): + for _, module in self.named_modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + # TODO (sayakpaul): implement `fuse_lora()` and `unfuse_lora()`.
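Reviewer note: a minimal end-to-end usage sketch of the new `ControlLoRAMixin` (not part of the diff). The checkpoint path is a placeholder, SDXL is used only for illustration, and passing the ControlNet's own config as `controlnet_config` is an assumption about the diffusers-format config dict that `convert_ldm_unet_checkpoint` expects:

```python
import torch
from diffusers import ControlNetModel, StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)

# ControlNet LoRA checkpoints ship only the LoRA parameters plus a handful of full
# parameters, so the ControlNet itself is initialized from the base UNet.
controlnet = ControlNetModel.from_unet(pipe.unet)

# `load_lora_weights` comes from `ControlLoRAMixin`; `controlnet_config` is forwarded
# to `convert_ldm_unet_checkpoint` for key remapping (assumed shape of the config).
controlnet.load_lora_weights(
    "path/to/control-lora.safetensors",  # placeholder checkpoint path
    controlnet_config=dict(controlnet.config),
)

# The LoRA layers can be detached again at any point:
controlnet.unload_lora_weights()
```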
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 220e34593c23..cafa6d4832dd 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -20,6 +20,7 @@ from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import FromOriginalControlnetMixin +from ..loaders import ControlLoRAMixin, FromOriginalControlnetMixin, UNet2DConditionLoadersMixin +from ..models.lora import LoRACompatibleConv from ..utils import BaseOutput, logging from .attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, @@ -80,7 +82,7 @@ def __init__( ): super().__init__() - self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + self.conv_in = LoRACompatibleConv(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) self.blocks = nn.ModuleList([]) @@ -106,8 +107,10 @@ def forward(self, conditioning): return embedding -class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin): +class ControlNetModel( + ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, FromOriginalControlnetMixin, ControlLoRAMixin +): """ A ControlNet model. @@ -250,7 +253,7 @@ def __init__( # input conv_in_kernel = 3 conv_in_padding = (conv_in_kernel - 1) // 2 - self.conv_in = nn.Conv2d( + self.conv_in = LoRACompatibleConv( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding ) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index a377ae267411..d5c672083707 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -200,10 +200,10 @@ def __init__( super().__init__() linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear - self.linear_1 = linear_cls(in_channels, time_embed_dim) + self.linear_1 = LoRACompatibleLinear(in_channels, time_embed_dim) if cond_proj_dim is not None: - self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + self.cond_proj = LoRACompatibleLinear(cond_proj_dim, in_channels, bias=False) else: self.cond_proj = None @@ -213,7 +213,7 @@ def __init__( time_embed_dim_out = out_dim else: time_embed_dim_out = time_embed_dim - self.linear_2 = linear_cls(time_embed_dim, time_embed_dim_out) + self.linear_2 = LoRACompatibleLinear(time_embed_dim, time_embed_dim_out) if post_act_fn is None: self.post_act = None diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 35466f008f54..9bfbab0c7d7b 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -304,6 +304,8 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): class_embed_type = "projection" assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels + else: + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = { "sample_size": image_size // vae_scale_factor, @@ -321,12 +323,6 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): "transformer_layers_per_block": transformer_layers_per_block, } - if "disable_self_attentions" in
unet_params: - config["only_cross_attention"] = unet_params.disable_self_attentions - - if "num_classes" in unet_params and isinstance(unet_params.num_classes, int): - config["num_class_embeds"] = unet_params.num_classes - if controlnet: config["conditioning_channels"] = unet_params.hint_channels else: @@ -381,11 +377,21 @@ def create_ldm_bert_config(original_config): def convert_ldm_unet_checkpoint( - checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False + checkpoint, + config, + path=None, + extract_ema=False, + controlnet=False, + skip_extract_state_dict=False, + controlnet_lora=False, ): """ Takes a state dict and a config, and returns a converted checkpoint. """ + if not controlnet and controlnet_lora: + raise ValueError(f"`controlnet_lora=True` requires `controlnet=True`, but got `controlnet={controlnet}`.") + if controlnet_lora: + skip_extract_state_dict = True if skip_extract_state_dict: unet_state_dict = checkpoint @@ -423,10 +429,22 @@ def convert_ldm_unet_checkpoint( new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + if controlnet_lora: + # Safe to pop: the "lora_controlnet" key is only a marker and holds no parameters. + _ = unet_state_dict.pop("lora_controlnet") + + if not controlnet_lora: + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + else: + new_checkpoint["time_embedding.linear_1.lora_down.weight"] = unet_state_dict["time_embed.0.down"] + new_checkpoint["time_embedding.linear_1.lora_up.weight"] = unet_state_dict["time_embed.0.up"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.lora_down.weight"] = unet_state_dict["time_embed.2.down"] + new_checkpoint["time_embedding.linear_2.lora_up.weight"] = unet_state_dict["time_embed.2.up"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port @@ -440,17 +458,26 @@ def convert_ldm_unet_checkpoint( raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") if config["addition_embed_type"] == "text_time": - new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] - new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] - new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] - new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] - - # Relevant to StableDiffusionUpscalePipeline - if "num_class_embeds" in config: - new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + if not controlnet_lora: + new_checkpoint["add_embedding.linear_1.weight"] = 
unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + else: + new_checkpoint["add_embedding.linear_1.lora_down.weight"] = unet_state_dict["label_emb.0.0.down"] + new_checkpoint["add_embedding.linear_1.lora_up.weight"] = unet_state_dict["label_emb.0.0.up"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.lora_down.weight"] = unet_state_dict["label_emb.0.2.down"] + new_checkpoint["add_embedding.linear_2.lora_up.weight"] = unet_state_dict["label_emb.0.2.up"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + + if not controlnet_lora: + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + else: + new_checkpoint["conv_in.lora_down.weight"] = unet_state_dict["input_blocks.0.0.down"] + new_checkpoint["conv_in.lora_up.weight"] = unet_state_dict["input_blocks.0.0.up"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -597,8 +623,9 @@ orig_index += 2 diffusers_index = 0 + diffusers_index_limit = 6 - while diffusers_index < 6: + while diffusers_index < diffusers_index_limit: new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( f"input_hint_block.{orig_index}.weight" ) @@ -608,12 +635,13 @@ diffusers_index += 1 orig_index += 2 - new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight" - ) - new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias" - ) + if not controlnet_lora: + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): @@ -624,6 +652,21 @@ new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") + if controlnet_lora: + modified_new_checkpoint = {} + down_pattern = r"\.down$" + up_pattern = r"\.up$" + + for key in new_checkpoint: + new_key = key + new_key = re.sub(down_pattern, ".lora.down.weight", new_key) + new_key = re.sub(up_pattern, ".lora.up.weight", new_key) + new_key = new_key.replace("lora_down", "lora.down") + new_key = new_key.replace("lora_up", "lora.up") + modified_new_checkpoint[new_key] = new_checkpoint[key] + + new_checkpoint = modified_new_checkpoint + return new_checkpoint @@ -787,12 +830,7 @@ def 
_copy_layers(hf_layers, pt_layers): def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None): if text_encoder is None: config_name = "openai/clip-vit-large-patch14" - try: - config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'." - ) + config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): @@ -927,12 +965,7 @@ def convert_open_clip_checkpoint( # text_model = CLIPTextModelWithProjection.from_pretrained( # "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280 # ) - try: - config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'." - ) + config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) ctx = init_empty_weights if is_accelerate_available() else nullcontext with ctx(): @@ -1145,7 +1178,6 @@ def download_from_original_stable_diffusion_ckpt( stable_unclip_prior: Optional[str] = None, clip_stats_path: Optional[str] = None, controlnet: Optional[bool] = None, - adapter: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, local_files_only=False, @@ -1230,13 +1262,14 @@ def download_from_original_stable_diffusion_ckpt( StableDiffusionControlNetPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, - StableDiffusionUpscalePipeline, StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, ) + if pipeline_class is None: + pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline + if prediction_type == "v-prediction": prediction_type = "v_prediction" @@ -1275,43 +1308,25 @@ def download_from_original_stable_diffusion_ckpt( key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias" key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias" - is_upscale = pipeline_class == StableDiffusionUpscalePipeline - - config_url = None # model_type = "v1" - if config_files is not None and "v1" in config_files: - original_config_file = config_files["v1"] - else: - config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024: # model_type = "v2" - if config_files is not None and "v2" in config_files: - original_config_file = config_files["v2"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + if global_step == 110000: # v2.1 needs to upcast attention upcast_attention = True 
elif key_name_sd_xl_base in checkpoint: # only base xl has two text embedders - if config_files is not None and "xl" in config_files: - original_config_file = config_files["xl"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" elif key_name_sd_xl_refiner in checkpoint: # only refiner xl has embedder and one text embedders - if config_files is not None and "xl_refiner" in config_files: - original_config_file = config_files["xl_refiner"] - else: - config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" - if is_upscale: - config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml" - - if config_url is not None: - original_config_file = BytesIO(requests.get(config_url).content) + original_config_file = BytesIO(requests.get(config_url).content) original_config = OmegaConf.load(original_config_file) @@ -1331,17 +1346,8 @@ def download_from_original_stable_diffusion_ckpt( if image_size is None: image_size = 1024 - if pipeline_class is None: - # Check if we have a SDXL or SD model and initialize default pipeline - if model_type not in ["SDXL", "SDXL-Refiner"]: - pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline - else: - pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline - if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline: num_in_channels = 9 - if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline: - num_in_channels = 7 elif num_in_channels is None: num_in_channels = 4 @@ -1425,13 +1431,9 @@ def download_from_original_stable_diffusion_ckpt( else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - if pipeline_class == StableDiffusionUpscalePipeline: - image_size = original_config.model.params.unet_config.params.image_size - # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention - path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=path, extract_ema=extract_ema ) @@ -1480,19 +1482,11 @@ def download_from_original_stable_diffusion_ckpt( config_name = "stabilityai/stable-diffusion-2" config_kwargs = {"subfolder": "text_encoder"} - text_model = convert_open_clip_checkpoint( - checkpoint, config_name, local_files_only=local_files_only, **config_kwargs + text_model = convert_open_clip_checkpoint(checkpoint, config_name, local_files_only=local_files_only, **config_kwargs) + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only ) - try: - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'." 
- ) - if stable_unclip is None: if controlnet: pipe = pipeline_class( @@ -1504,29 +1498,8 @@ def download_from_original_stable_diffusion_ckpt( controlnet=controlnet, safety_checker=None, feature_extractor=None, + requires_safety_checker=False, ) - if hasattr(pipe, "requires_safety_checker"): - pipe.requires_safety_checker = False - - elif pipeline_class == StableDiffusionUpscalePipeline: - scheduler = DDIMScheduler.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler" - ) - low_res_scheduler = DDPMScheduler.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler" - ) - - pipe = pipeline_class( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - low_res_scheduler=low_res_scheduler, - safety_checker=None, - feature_extractor=None, - ) - else: pipe = pipeline_class( vae=vae, @@ -1536,10 +1509,8 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, + requires_safety_checker=False, ) - if hasattr(pipe, "requires_safety_checker"): - pipe.requires_safety_checker = False - else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1570,14 +1541,9 @@ def download_from_original_stable_diffusion_ckpt( karlo_model, subfolder="prior", local_files_only=local_files_only ) - try: - prior_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + prior_tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) prior_text_model = CLIPTextModelWithProjection.from_pretrained( "openai/clip-vit-large-patch14", local_files_only=local_files_only ) @@ -1610,22 +1576,10 @@ def download_from_original_stable_diffusion_ckpt( raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") elif model_type == "PaintByExample": vision_model = convert_paint_by_example_checkpoint(checkpoint) - try: - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) - try: - feature_extractor = AutoFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'." 
- ) + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + feature_extractor = AutoFeatureExtractor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) pipe = PaintByExamplePipeline( vae=vae, image_encoder=vision_model, @@ -1638,16 +1592,11 @@ def download_from_original_stable_diffusion_ckpt( text_model = convert_ldm_clip_checkpoint( checkpoint, local_files_only=local_files_only, text_encoder=text_encoder ) - try: - tokenizer = ( - CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) - if tokenizer is None - else tokenizer - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + tokenizer = ( + CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + if tokenizer is None + else tokenizer + ) if load_safety_checker: safety_checker = StableDiffusionSafetyChecker.from_pretrained( @@ -1683,33 +1632,18 @@ def download_from_original_stable_diffusion_ckpt( ) elif model_type in ["SDXL", "SDXL-Refiner"]: if model_type == "SDXL": - try: - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." - ) + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) + tokenizer_2 = CLIPTokenizer.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only + ) config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.1.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, + checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, local_files_only=local_files_only, **config_kwargs ) if is_accelerate_available(): # SBM Now move model to cpu. 
@@ -1729,18 +1663,6 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, force_zeros_for_empty_prompt=True, ) - elif adapter: - pipe = pipeline_class( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - text_encoder_2=text_encoder_2, - tokenizer_2=tokenizer_2, - unet=unet, - adapter=adapter, - scheduler=scheduler, - force_zeros_for_empty_prompt=True, - ) else: pipe = pipeline_class( vae=vae, @@ -1755,23 +1677,14 @@ def download_from_original_stable_diffusion_ckpt( else: tokenizer = None text_encoder = None - try: - tokenizer_2 = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only - ) - except Exception: - raise ValueError( - f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." - ) + tokenizer_2 = CLIPTokenizer.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only + ) + config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" config_kwargs = {"projection_dim": 1280} text_encoder_2 = convert_open_clip_checkpoint( - checkpoint, - config_name, - prefix="conditioner.embedders.0.model.", - has_projection=True, - local_files_only=local_files_only, - **config_kwargs, + checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, local_files_only=local_files_only, **config_kwargs ) if is_accelerate_available(): # SBM Now move model to cpu.
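Reviewer note: the final key-remapping pass added to `convert_ldm_unet_checkpoint` is easy to misread, so here is a standalone sketch of just that step; the sample keys are illustrative, not taken from a real checkpoint:

```python
import re

def remap_control_lora_keys(checkpoint: dict) -> dict:
    # Re-implementation of only the final rename pass above, for illustration.
    remapped = {}
    for key, value in checkpoint.items():
        new_key = re.sub(r"\.down$", ".lora.down.weight", key)  # bare ".down" suffixes
        new_key = re.sub(r"\.up$", ".lora.up.weight", new_key)  # bare ".up" suffixes
        new_key = new_key.replace("lora_down", "lora.down")     # explicitly set lora_down keys
        new_key = new_key.replace("lora_up", "lora.up")
        remapped[new_key] = value
    return remapped

sample = {
    "time_embedding.linear_1.lora_down.weight": None,  # set explicitly for controlnet_lora
    "down_blocks.0.resnets.0.conv1.down": None,        # bulk-mapped key with a bare suffix
}
print(list(remap_control_lora_keys(sample)))
# ['time_embedding.linear_1.lora.down.weight', 'down_blocks.0.resnets.0.conv1.lora.down.weight']
```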