From 88b77bc80e3c18a5928605ce35ebca62fd226b8e Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 3 Feb 2023 12:14:42 +0100 Subject: [PATCH 01/66] Add AudioLDM --- .../convert_original_audioldm_to_diffusers.py | 125 ++ src/diffusers/__init__.py | 1 + src/diffusers/models/cross_attention.py | 2 +- src/diffusers/models/resnet.py | 2 +- src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/audioldm/__init__.py | 1 + .../pipelines/audioldm/convert_from_ckpt.py | 1032 +++++++++++++++++ .../pipelines/audioldm/pipeline_audioldm.py | 597 ++++++++++ 8 files changed, 1759 insertions(+), 2 deletions(-) create mode 100644 scripts/convert_original_audioldm_to_diffusers.py create mode 100644 src/diffusers/pipelines/audioldm/__init__.py create mode 100644 src/diffusers/pipelines/audioldm/convert_from_ckpt.py create mode 100644 src/diffusers/pipelines/audioldm/pipeline_audioldm.py diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py new file mode 100644 index 000000000000..e376faf2a159 --- /dev/null +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse + +from diffusers.pipelines.audioldm.convert_from_ckpt import load_pipeline_from_original_audioldm_ckpt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_path", + default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", + type=str, + required=False, + help="Path to the checkpoint to convert.", + ) + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml + parser.add_argument( + "--original_config_file", + default=None, + type=str, + help="The YAML config file corresponding to the original architecture.", + ) + parser.add_argument( + "--num_in_channels", + default=None, + type=int, + help="The number of input channels. If `None` number of input channels will be automatically inferred.", + ) + parser.add_argument( + "--scheduler_type", + default="ddim", + type=str, + help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", + ) + parser.add_argument( + "--pipeline_type", + default=None, + type=str, + help="The pipeline type. If `None` pipeline will be automatically inferred.", + ) + parser.add_argument( + "--image_size", + default=None, + type=int, + help=( + "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" + " Base. Use 768 for Stable Diffusion v2." + ), + ) + parser.add_argument( + "--prediction_type", + default=None, + type=str, + help=( + "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable" + " Siffusion v2 Base. Use 'v-prediction' for Stable Diffusion v2." 
+ ), + ) + parser.add_argument( + "--extract_ema", + action="store_true", + help=( + "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" + " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" + " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." + ), + ) + parser.add_argument( + "--upcast_attention", + default=False, + type=bool, + help=( + "Whether the attention computation should always be upcasted. This is necessary when running stable" + " diffusion 2.1." + ), + ) + parser.add_argument( + "--from_safetensors", + action="store_true", + help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.", + ) + parser.add_argument( + "--to_safetensors", + action="store_true", + help="Whether to store pipeline in safetensors format or not.", + ) + parser.add_argument( + "--dump_path", + default="/Users/sanchitgandhi/convert-audioldm/diffusers_out", + type=str, + required=False, + help="Path to the output model.", + ) + parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") + args = parser.parse_args() + + pipe = load_pipeline_from_original_audioldm_ckpt( + checkpoint_path=args.checkpoint_path, + original_config_file=args.original_config_file, + image_size=args.image_size, + prediction_type=args.prediction_type, + model_type=args.pipeline_type, + extract_ema=args.extract_ema, + scheduler_type=args.scheduler_type, + num_in_channels=args.num_in_channels, + upcast_attention=args.upcast_attention, + from_safetensors=args.from_safetensors, + ) + pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f9803380121a..5b54a280ec9d 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -106,6 +106,7 @@ from .pipelines import ( AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, + AudioLDMPipeline, CycleDiffusionPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, diff --git a/src/diffusers/models/cross_attention.py b/src/diffusers/models/cross_attention.py index 4cd912b80a73..8da05c1a82c1 100644 --- a/src/diffusers/models/cross_attention.py +++ b/src/diffusers/models/cross_attention.py @@ -62,7 +62,7 @@ def __init__( ): super().__init__() inner_dim = dim_head * heads - cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + cross_attention_dim = query_dim self.upcast_attention = upcast_attention self.upcast_softmax = upcast_softmax diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 7037da5725cf..adbf5d35d827 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -412,7 +412,7 @@ def __init__( else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") - self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) + self.time_emb_proj = torch.nn.Linear(temb_channels * 2, time_emb_proj_out_channels) else: self.time_emb_proj = None diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b69363f545e5..7a926cc3e9b0 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -42,6 +42,7 @@ from ..utils.dummy_torch_and_transformers_objects import * # noqa F403 else: from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline + 
from .audioldm import AudioLDMPipeline from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .stable_diffusion import ( diff --git a/src/diffusers/pipelines/audioldm/__init__.py b/src/diffusers/pipelines/audioldm/__init__.py new file mode 100644 index 000000000000..719bbbce847c --- /dev/null +++ b/src/diffusers/pipelines/audioldm/__init__.py @@ -0,0 +1 @@ +from .pipeline_audioldm import AudioLDMPipeline diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py new file mode 100644 index 000000000000..4c79b98a7006 --- /dev/null +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -0,0 +1,1032 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the Stable Diffusion checkpoints.""" + +import os +import re +import tempfile + +import torch + +import requests +from diffusers import ( + AudioLDMPipeline, + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel +from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from transformers import AutoFeatureExtractor, BertTokenizerFast, CLAPAudioConfig, CLAPTextModel, CLAPTokenizer + +from ...utils import is_omegaconf_available, is_safetensors_available +from ...utils.import_utils import BACKENDS_MAPPING + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. 
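+
+    For example, `shave_segments("a.b.c.d", n_shave_prefix_segments=1)` returns `"b.c.d"`, while
+    `shave_segments("a.b.c.d", n_shave_prefix_segments=-1)` returns `"a.b.c"`.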
+ """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("nin_shortcut", "conv_shortcut") + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') + + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("q.weight", "query.weight") + new_item = new_item.replace("q.bias", "query.bias") + + new_item = new_item.replace("k.weight", "key.weight") + new_item = new_item.replace("k.bias", "key.bias") + + new_item = new_item.replace("v.weight", "value.weight") + new_item = new_item.replace("v.bias", "value.bias") + + new_item = new_item.replace("proj_out.weight", "proj_attn.weight") + new_item = new_item.replace("proj_out.bias", "proj_attn.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits + attention layers, and takes into account additional replacements that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
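+    # (The original LDM layout stores the attention projection as a single fused qkv tensor;
+    #  the block below reshapes it per attention head and writes separate query/key/value
+    #  entries into the new checkpoint.)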
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) + + for path in paths: + new_path = path["new"] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config, image_size: int): + """ + Creates a config for the diffusers based on the config of the LDM model. 
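+    Down/up block types are inferred from the original `attention_resolutions`: resolutions listed
+    there map to `CrossAttnDownBlock2D`/`CrossAttnUpBlock2D`, all others to `DownBlock2D`/`UpBlock2D`.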
+ """ + unet_params = original_config.model.params.unet_config.params + vae_params = original_config.model.params.first_stage_config.params.ddconfig + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + + head_dim = unet_params.num_heads if "num_heads" in unet_params else None + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + config = dict( + sample_size=image_size // vae_scale_factor, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + # cross_attention_dim=unet_params.context_dim, + # attention_head_dim=head_dim, + use_linear_projection=use_linear_projection, + ) + + return config + + +def create_vae_diffusers_config(original_config, image_size: int): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + _ = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = dict( + sample_size=image_size, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + ) + return config + + +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + +def create_ldm_bert_config(original_config): + bert_params = original_config.model.parms.cond_stage_config.params + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config + + +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + + unet_key = "model.diffusion_model." 
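+    # The original checkpoint stores the UNet weights under the `model.diffusion_model.` prefix
+    # (with optional EMA copies under `model_ema.`); the block below strips that prefix and
+    # selects either the EMA or the non-EMA weights.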
+ # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + print(f"Checkpoint {path} has both EMA and non-EMA weights.") + print( + "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." + ) + for key in keys: + if key.startswith("model.diffusion_model"): + flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + else: + if sum(k.startswith("model_ema") for k in keys) > 100: + print( + "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + " weights (usually better for inference), please make sure to add the `--extract_ema` flag." + ) + + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + 
meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # Clear attentions as they have been attributed above. 
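+            # (a length of 2 means the only `output_blocks.{i}.1` keys were the upsampler conv
+            #  weight/bias handled just above, i.e. this block has no attention layer)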
+ if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + paths = renew_vae_resnet_paths(resnets) + meta_path = 
{"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + def 
_copy_layers(hf_layers, pt_layers): + for i, hf_layer in enumerate(hf_layers): + if i != 0: + i += i + pt_layer = pt_layers[i : i + 2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + +def convert_ldm_clip_checkpoint(checkpoint): + text_model = CLAPTextModel.from_pretrained("openai/clip-vit-large-patch14") + + keys = list(checkpoint.keys()) + + text_model_dict = {} + + for key in keys: + if key.startswith("cond_stage_model.transformer"): + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] + + text_model.load_state_dict(text_model_dict) + + return text_model + + +textenc_conversion_lst = [ + ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), + ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), + ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), + ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), +] +textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} + +textenc_transformer_conversion_lst = [ + # (stable-diffusion, HF Diffusers) + ("resblocks.", "text_model.encoder.layers."), + ("ln_1", "layer_norm1"), + ("ln_2", "layer_norm2"), + (".c_fc.", ".fc1."), + (".c_proj.", ".fc2."), + (".attn", ".self_attn"), + ("ln_final.", "transformer.text_model.final_layer_norm."), + ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"), + ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"), +] +protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} +textenc_pattern = re.compile("|".join(protected.keys())) + + +def convert_paint_by_example_checkpoint(checkpoint): + config = CLAPVisionConfig.from_pretrained("openai/clip-vit-large-patch14") + model = PaintByExampleImageEncoder(config) + + keys = list(checkpoint.keys()) + + text_model_dict = {} + + for key in keys: + if key.startswith("cond_stage_model.transformer"): + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] + + # load clip vision + model.model.load_state_dict(text_model_dict) + + # load mapper + keys_mapper = { + k[len("cond_stage_model.mapper.res") :]: v + for k, v in checkpoint.items() + if k.startswith("cond_stage_model.mapper") + } + + MAPPING = { + "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"], + "attn.c_proj": ["attn1.to_out.0"], + "ln_1": ["norm1"], + "ln_2": ["norm3"], + "mlp.c_fc": ["ff.net.0.proj"], + "mlp.c_proj": ["ff.net.2"], + } + + mapped_weights = {} + for key, value in keys_mapper.items(): + prefix = key[: len("blocks.i")] + suffix = key.split(prefix)[-1].split(".")[-1] + name = key.split(prefix)[-1].split(suffix)[0][1:-1] + mapped_names = MAPPING[name] + + num_splits = len(mapped_names) + for i, mapped_name in enumerate(mapped_names): + new_name = ".".join([prefix, mapped_name, suffix]) + shape = value.shape[0] // num_splits + mapped_weights[new_name] = value[i * shape : (i + 1) * shape] + + 
model.mapper.load_state_dict(mapped_weights) + + # load final layer norm + model.final_layer_norm.load_state_dict( + { + "bias": checkpoint["cond_stage_model.final_ln.bias"], + "weight": checkpoint["cond_stage_model.final_ln.weight"], + } + ) + + # load final proj + model.proj_out.load_state_dict( + { + "bias": checkpoint["proj_out.bias"], + "weight": checkpoint["proj_out.weight"], + } + ) + + # load uncond vector + model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"]) + return model + + +def convert_laion_clap_checkpoint(checkpoint): + text_model = CLAPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") + + keys = list(checkpoint.keys()) + + text_model_dict = {} + + d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) + + text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") + + for key in keys: + if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer + continue + if key in textenc_conversion_map: + text_model_dict[textenc_conversion_map[key]] = checkpoint[key] + if key.startswith("cond_stage_model.model.transformer."): + new_key = key[len("cond_stage_model.model.transformer.") :] + if new_key.endswith(".in_proj_weight"): + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] + elif new_key.endswith(".in_proj_bias"): + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] + else: + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + + text_model_dict[new_key] = checkpoint[key] + + text_model.load_state_dict(text_model_dict) + + return text_model + + +def load_pipeline_from_original_audioldm_ckpt( + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: int = None, + upcast_attention: bool = None, + device: str = None, + from_safetensors: bool = False, +) -> AudioLDMPipeline: + """ + Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` + config file. + + Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the + global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is + recommended that you override the default values and/or supply an `original_config_file` wherever possible. + + :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file + corresponding to the original architecture. If `None`, will be + automatically inferred by looking for a key that only exists in SD2.0 models. + :param image_size: The image size that the model was trained on. 
Use 512 for Stable Diffusion v1.X and Stable + Siffusion v2 + Base. Use 768 for Stable Diffusion v2. + :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion + v1.X and Stable + Siffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2. + :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically + inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", + "euler-ancestral", "dpm", "ddim"]`. :param model_type: The pipeline type. `None` to automatically infer, or one of + `["FrozenOpenCLAPEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights + or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher + quality images for inference. Non-EMA weights are usually better to continue fine-tuning. + :param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when + running + stable diffusion 2.1. + :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If + `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A + StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + """ + + if not is_omegaconf_available(): + raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) + + from omegaconf import OmegaConf + + # TODO: remove this func for final PR + # Copied from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 + def default_audioldm_config(): + return OmegaConf.create( + { + "wave_file_save_path": "./output", + "id": { + "version": "v1", + "name": "default", + "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml", + }, + "model": { + "device": "cuda", + "reload_from_ckpt": "/mnt/fast/nobackup/scratch4weeks/hl01486/exps/audio_generation/stablediffusion/LDM/audioverse/2023_01_14_full_F4_B_spatial_v2_v1/checkpoints/last.ckpt", + "target": "audioldm.pipline.LatentDiffusion", + "params": { + "base_learning_rate": 5e-06, + "linear_start": 0.0015, + "linear_end": 0.0195, + "num_timesteps_cond": 1, + "log_every_t": 200, + "timesteps": 1000, + "first_stage_key": "fbank", + "cond_stage_key": "waveform", + "latent_t_size": 256, + "latent_f_size": 16, + "channels": 8, + "cond_stage_trainable": True, + "conditioning_key": "film", + "monitor": "val/loss_simple_ema", + "scale_by_std": True, + "unet_config": { + "target": "audioldm.latent_diffusion.openaimodel.UNetModel", + "params": { + "image_size": 64, + "extra_film_condition_dim": 512, + "extra_film_use_concat": True, + "in_channels": 8, + "out_channels": 8, + "model_channels": 128, + "attention_resolutions": [8, 4, 2], + "num_res_blocks": 2, + "channel_mult": [1, 2, 3, 5], + "num_head_channels": 32, + "use_spatial_transformer": True, + }, + }, + "first_stage_config": { + "base_learning_rate": 4.5e-05, + "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", + "params": { + "monitor": "val/rec_loss", + "image_key": "fbank", + "subband": 1, + "embed_dim": 8, + "time_shuffle": 1, + "ddconfig": { + "double_z": True, + "z_channels": 8, + "resolution": 256, + "downsample_time": False, 
+ "in_channels": 1, + "out_ch": 1, + "ch": 128, + "ch_mult": [1, 2, 4], + "num_res_blocks": 2, + "attn_resolutions": [], + "dropout": 0.0, + }, + }, + }, + "cond_stage_config": { + "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2", + "params": { + "key": "waveform", + "sampling_rate": 16000, + "embed_mode": "audio", + "unconditional_prob": 0.1, + }, + }, + }, + }, + } + ) + + if from_safetensors: + if not is_safetensors_available(): + raise ValueError(BACKENDS_MAPPING["safetensors"][1]) + + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(checkpoint_path, map_location=device) + else: + checkpoint = torch.load(checkpoint_path, map_location=device) + + # Sometimes models don't have the global_step item + if "global_step" in checkpoint: + global_step = checkpoint["global_step"] + else: + print("global_step key not found in model") + global_step = None + + if "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + if original_config_file is None: + original_config = default_audioldm_config() + else: + original_config = OmegaConf.load(original_config_file) + + if num_in_channels is not None: + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + if prediction_type is None: + # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` + # as it relies on a brittle global step parameter here + prediction_type = "epsilon" if global_step == 875000 else "v_prediction" + if image_size is None: + # NOTE: For stable diffusion 2 base one has to pass `image_size==512` + # as it relies on a brittle global step parameter here + image_size = 512 if global_step == 875000 else 768 + else: + if prediction_type is None: + prediction_type = "epsilon" + if image_size is None: + image_size = 512 + + num_train_timesteps = original_config.model.params.timesteps + beta_start = original_config.model.params.linear_start + beta_end = original_config.model.params.linear_end + + scheduler = DDIMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + prediction_type=prediction_type, + ) + # make sure scheduler works correctly with DDIM + scheduler.register_to_config(clip_sample=False) + + if scheduler_type == "pndm": + config = dict(scheduler.config) + config["skip_prk_steps"] = True + scheduler = PNDMScheduler.from_config(config) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "dpm": + scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) + elif scheduler_type == "ddim": + scheduler = scheduler + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + # Convert 
the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + unet_config["upcast_attention"] = upcast_attention + unet = UNet2DConditionModel(**unet_config) + + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the VAE model. + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model. + if model_type is None: + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + + if model_type == "CLAPAudioEmbeddingClassifierFreev2": + # TODO: Load CLAP tokenizer + model + """text_model = convert_laion_clap_checkpoint(checkpoint) + tokenizer = CLAPTokenizer.from_pretrained() pipe = AudioLDMPipeline( + vae=vae, text_encoder=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler, + ) + + return pipe""" \ No newline at end of file diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py new file mode 100644 index 000000000000..53853283c61c --- /dev/null +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -0,0 +1,597 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch + +from packaging import version +from transformers import CLAPTextModel, CLAPTokenizer + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# TODO: update doc string for AudioLDM +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class AudioLDMPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-audio generation using AudioLDM. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
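+
+    A minimal usage sketch (assuming a checkpoint that has already been converted with
+    `scripts/convert_original_audioldm_to_diffusers.py`; the local path below is only a placeholder):
+
+    ```py
+    >>> from diffusers import AudioLDMPipeline
+
+    >>> pipe = AudioLDMPipeline.from_pretrained("./audioldm-converted")  # placeholder path
+    >>> audio = pipe("a hammer hitting a wooden surface").audios[0]
+    ```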
+ + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLAPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModel), specifically + the (TODO) variant. + tokenizer (`CLAPTokenizer`): + Tokenizer of class + [CLAPTokenizer](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/clap#transformers.CLAPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLAPTextModel, + tokenizer: CLAPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. 
If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config() + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
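+        For this pipeline the text encoder is the CLAP text model, so the returned tensor contains
+        CLAP text hidden states, duplicated `num_images_per_prompt` times and, when classifier-free
+        guidance is enabled, prepended with the unconditional (negative prompt) embeddings.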
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_ prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. 
Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # 8. 
Post-processing + audio = self.decode_latents(latents) + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) From 9b353b01ab5e2ecd5f410fdb4267310be4979c6a Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 3 Feb 2023 16:29:03 +0100 Subject: [PATCH 02/66] up --- .../convert_original_audioldm_to_diffusers.py | 4 ++-- src/diffusers/models/attention.py | 2 +- src/diffusers/models/cross_attention.py | 2 +- src/diffusers/models/resnet.py | 2 +- src/diffusers/models/unet_2d_condition.py | 4 ++++ .../pipelines/audioldm/convert_from_ckpt.py | 2 +- .../pipelines/audioldm/pipeline_audioldm.py | 24 +++++++++++++------ 7 files changed, 27 insertions(+), 13 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index e376faf2a159..88a55393b766 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -26,7 +26,7 @@ "--checkpoint_path", default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", type=str, - required=False, + required=False, # TODO: revert to True help="Path to the checkpoint to convert.", ) # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml @@ -104,7 +104,7 @@ "--dump_path", default="/Users/sanchitgandhi/convert-audioldm/diffusers_out", type=str, - required=False, + required=False, # TODO: revert to True help="Path to the output model.", ) parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index b5acd6f4f900..d1d58887c2aa 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -236,7 +236,7 @@ def __init__( if cross_attention_dim is not None: self.attn2 = CrossAttention( query_dim=dim, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, heads=num_attention_heads, dim_head=attention_head_dim, dropout=dropout, diff --git a/src/diffusers/models/cross_attention.py b/src/diffusers/models/cross_attention.py index 8da05c1a82c1..4cd912b80a73 100644 --- a/src/diffusers/models/cross_attention.py +++ b/src/diffusers/models/cross_attention.py @@ -62,7 +62,7 @@ def __init__( ): super().__init__() inner_dim = dim_head * heads - cross_attention_dim = query_dim + cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim self.upcast_attention = upcast_attention self.upcast_softmax = upcast_softmax diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index adbf5d35d827..7037da5725cf 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -412,7 +412,7 @@ def __init__( else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") - self.time_emb_proj = torch.nn.Linear(temb_channels * 2, time_emb_proj_out_channels) + self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels) else: self.time_emb_proj = None diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index c524dbf2bed3..ea2f5b8c82bd 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -152,6 +152,10 @@ def __init__( else: self.class_embedding = None + # TODO: add as an arg + if True: + time_embed_dim = time_embed_dim * 2 + self.down_blocks = 
nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 4c79b98a7006..dc862a36de1a 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -247,7 +247,7 @@ def create_unet_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, - # cross_attention_dim=unet_params.context_dim, + cross_attention_dim=True, # attention_head_dim=head_dim, use_linear_projection=use_linear_projection, ) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 53853283c61c..a6e7d40bca5a 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -18,7 +18,7 @@ import torch from packaging import version -from transformers import CLAPTextModel, CLAPTokenizer +from transformers import CLAPTextModel, CLAPTokenizer #, HifiGanVocoder from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -66,6 +66,8 @@ class AudioLDMPipeline(DiffusionPipeline): scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + vocoder (`HifiGanVocoder`): + Vocoder of class [HifiGanVocoder](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.HifiGanVocoder). """ _optional_components = ["safety_checker", "feature_extractor"] @@ -76,6 +78,7 @@ def __init__( tokenizer: CLAPTokenizer, unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, + #vocoder: HifiGanVocoder, ): super().__init__() @@ -133,6 +136,7 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, + #vocoder=vocoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config() @@ -327,11 +331,15 @@ def _encode_prompt( def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() - return image + mel_spectrogram = self.vae.decode(latents).sample + return mel_spectrogram + + def mel_spectrogram_to_waveform(self, mel_spectrogram): + mel_spectrogram = mel_spectrogram.permute(0, 2, 1) + waveform = self.vocoder(mel_spectrogram) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + waveform = waveform.cpu().detach().numpy() + return waveform def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -589,7 +597,9 @@ def __call__( callback(i, t, latents) # 8. 
Post-processing - audio = self.decode_latents(latents) + mel_spectrogram = self.decode_latents(latents) + + audio = self.mel_spectrogram_to_waveform(mel_spectrogram) if not return_dict: return (audio,) From 1a3ea27d45ca95fd25e3042683be038b8e2cb08d Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 9 Feb 2023 15:10:58 +0100 Subject: [PATCH 03/66] add vocoder --- src/diffusers/models/unet_2d_condition.py | 4 - .../pipelines/audioldm/convert_from_ckpt.py | 107 +++--------------- .../pipelines/audioldm/pipeline_audioldm.py | 19 ++-- 3 files changed, 25 insertions(+), 105 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index ea2f5b8c82bd..c524dbf2bed3 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -152,10 +152,6 @@ def __init__( else: self.class_embedding = None - # TODO: add as an arg - if True: - time_embed_dim = time_embed_dim * 2 - self.down_blocks = nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index dc862a36de1a..907cd18a4da6 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -36,8 +36,8 @@ ) from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from transformers import AutoFeatureExtractor, BertTokenizerFast, CLAPAudioConfig, CLAPTextModel, CLAPTokenizer +from transformers import AutoFeatureExtractor, CLAPAudioConfig, CLAPTextModel, AutoTokenizer, SpeechT5HifiGan, \ + CLAPTextConfig from ...utils import is_omegaconf_available, is_safetensors_available from ...utils.import_utils import BACKENDS_MAPPING @@ -628,23 +628,6 @@ def _copy_layers(hf_layers, pt_layers): return hf_model - -def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLAPTextModel.from_pretrained("openai/clip-vit-large-patch14") - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - text_model.load_state_dict(text_model_dict) - - return text_model - - textenc_conversion_lst = [ ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), @@ -669,73 +652,6 @@ def convert_ldm_clip_checkpoint(checkpoint): textenc_pattern = re.compile("|".join(protected.keys())) -def convert_paint_by_example_checkpoint(checkpoint): - config = CLAPVisionConfig.from_pretrained("openai/clip-vit-large-patch14") - model = PaintByExampleImageEncoder(config) - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] - - # load clip vision - model.model.load_state_dict(text_model_dict) - - # load mapper - keys_mapper = { - k[len("cond_stage_model.mapper.res") :]: v - for k, v in checkpoint.items() - if k.startswith("cond_stage_model.mapper") - } - - MAPPING = { - "attn.c_qkv": ["attn1.to_q", "attn1.to_k", 
"attn1.to_v"], - "attn.c_proj": ["attn1.to_out.0"], - "ln_1": ["norm1"], - "ln_2": ["norm3"], - "mlp.c_fc": ["ff.net.0.proj"], - "mlp.c_proj": ["ff.net.2"], - } - - mapped_weights = {} - for key, value in keys_mapper.items(): - prefix = key[: len("blocks.i")] - suffix = key.split(prefix)[-1].split(".")[-1] - name = key.split(prefix)[-1].split(suffix)[0][1:-1] - mapped_names = MAPPING[name] - - num_splits = len(mapped_names) - for i, mapped_name in enumerate(mapped_names): - new_name = ".".join([prefix, mapped_name, suffix]) - shape = value.shape[0] // num_splits - mapped_weights[new_name] = value[i * shape : (i + 1) * shape] - - model.mapper.load_state_dict(mapped_weights) - - # load final layer norm - model.final_layer_norm.load_state_dict( - { - "bias": checkpoint["cond_stage_model.final_ln.bias"], - "weight": checkpoint["cond_stage_model.final_ln.weight"], - } - ) - - # load final proj - model.proj_out.load_state_dict( - { - "bias": checkpoint["proj_out.bias"], - "weight": checkpoint["proj_out.weight"], - } - ) - - # load uncond vector - model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"]) - return model - - def convert_laion_clap_checkpoint(checkpoint): text_model = CLAPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") @@ -779,7 +695,7 @@ def convert_laion_clap_checkpoint(checkpoint): def load_pipeline_from_original_audioldm_ckpt( checkpoint_path: str, original_config_file: str = None, - image_size: int = 512, + image_size: int = 1024, prediction_type: str = None, model_type: str = None, extract_ema: bool = False, @@ -1024,9 +940,16 @@ def default_audioldm_config(): if model_type == "CLAPAudioEmbeddingClassifierFreev2": # TODO: Load CLAP tokenizer + model - """text_model = convert_laion_clap_checkpoint(checkpoint) - tokenizer = CLAPTokenizer.from_pretrained() pipe = AudioLDMPipeline( - vae=vae, text_encoder=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler, - ) + #text_model = CLAPTextModel.from_pretrained("laion-ai/clap-htsat-unfused") + #tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") + + config = CLAPTextConfig() + text_model = CLAPTextModel(config) + tokenizer = AutoTokenizer.from_pretrained("roberta-base") + + vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") + pipe = AudioLDMPipeline( + vae=vae, text_encoder=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler, vocoder=vocoder, + ) - return pipe""" \ No newline at end of file + return pipe \ No newline at end of file diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index a6e7d40bca5a..9a93bacff877 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -18,7 +18,7 @@ import torch from packaging import version -from transformers import CLAPTextModel, CLAPTokenizer #, HifiGanVocoder +from transformers import CLAPTextModel, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -56,18 +56,18 @@ class AudioLDMPipeline(DiffusionPipeline): vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. text_encoder ([`CLAPTextModel`]): - Frozen text-encoder. Stable Diffusion uses the text portion of + Frozen text-encoder. 
AudioLDM uses the text portion of [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModel), specifically the (TODO) variant. - tokenizer (`CLAPTokenizer`): + tokenizer ([`PreTrainedTokenizer`]): Tokenizer of class - [CLAPTokenizer](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/clap#transformers.CLAPTokenizer). + [RobertaTokenizer](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/clap#transformers.RobertaTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - vocoder (`HifiGanVocoder`): - Vocoder of class [HifiGanVocoder](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.HifiGanVocoder). + vocoder (`SpeechT5HifiGan`): + Vocoder of class [SpeechT5HifiGan](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.SpeechT5HifiGan). """ _optional_components = ["safety_checker", "feature_extractor"] @@ -75,10 +75,10 @@ def __init__( self, vae: AutoencoderKL, text_encoder: CLAPTextModel, - tokenizer: CLAPTokenizer, + tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, - #vocoder: HifiGanVocoder, + vocoder: SpeechT5HifiGan, ): super().__init__() @@ -136,7 +136,7 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, - #vocoder=vocoder, + vocoder=vocoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config() @@ -552,6 +552,7 @@ def __call__( # 5. Prepare latent variables num_channels_latents = self.unet.in_channels + import ipdb; ipdb.set_trace() latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, From 1023f68b93dcc1cfd51898476f51ba761316c20a Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 14 Feb 2023 13:49:34 +0100 Subject: [PATCH 04/66] start unet --- .../convert_original_audioldm_to_diffusers.py | 2 +- src/diffusers/models/unet_2d_condition.py | 19 ++++++ .../pipelines/audioldm/convert_from_ckpt.py | 64 +++++++++---------- .../pipelines/audioldm/pipeline_audioldm.py | 36 ++++++----- 4 files changed, 72 insertions(+), 49 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 88a55393b766..7e6ea25bd530 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -26,7 +26,7 @@ "--checkpoint_path", default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", type=str, - required=False, # TODO: revert to True + required=False, # TODO: revert to True help="Path to the checkpoint to convert.", ) # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index c524dbf2bed3..9180d30b9284 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -127,6 +127,8 @@ def __init__( num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", + extra_film_condition_dim: int = None, + extra_film_use_concat: bool = False, ): super().__init__() @@ -152,6 +154,18 @@ 
def __init__( else: self.class_embedding = None + # film condition + if self.class_embedding is not None and extra_film_condition_dim is not None: + raise ValueError("You cannot set both `class_embed_type` and `extra_film_use_concat`.") + self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat + self.use_extra_film_by_addition = extra_film_condition_dim is not None and not extra_film_use_concat + + if extra_film_condition_dim is not None: + self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + + if self.use_extra_film_by_concat: + time_embed_dim = time_embed_dim * 2 + self.down_blocks = nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) @@ -478,6 +492,11 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb + if self.use_extra_film_by_addition: + emb = emb + self.film_embedding(class_labels) + elif self.use_extra_film_by_concat: + emb = torch.cat([emb, self.film_embedding(class_labels)], dim=-1) + # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 907cd18a4da6..6fd90b838c7c 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -14,13 +14,10 @@ # limitations under the License. """ Conversion script for the Stable Diffusion checkpoints.""" -import os import re -import tempfile import torch -import requests from diffusers import ( AudioLDMPipeline, AutoencoderKL, @@ -29,15 +26,17 @@ EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, HeunDiscreteScheduler, - LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, ) from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline -from transformers import AutoFeatureExtractor, CLAPAudioConfig, CLAPTextModel, AutoTokenizer, SpeechT5HifiGan, \ - CLAPTextConfig +from transformers import ( + AutoTokenizer, + ClapTextConfig, + ClapTextModelWithProjection, + SpeechT5HifiGan, +) from ...utils import is_omegaconf_available, is_safetensors_available from ...utils.import_utils import BACKENDS_MAPPING @@ -230,14 +229,10 @@ def create_unet_diffusers_config(original_config, image_size: int): vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + extra_film_condition_dim = ( + unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None ) - if use_linear_projection: - # stable diffusion 2-base-512 and 2-768 - if head_dim is None: - head_dim = [5, 10, 20, 20] + extra_film_use_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None config = dict( sample_size=image_size // vae_scale_factor, @@ -248,8 +243,8 @@ def create_unet_diffusers_config(original_config, image_size: int): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=True, - # attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, + extra_film_condition_dim=extra_film_condition_dim, + extra_film_use_concat=extra_film_use_concat, ) return config @@ -338,6 
+333,9 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + new_checkpoint["film_embedding.weight"] = unet_state_dict["film_emb.weight"] + new_checkpoint["film_embedding.bias"] = unet_state_dict["film_emb.bias"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] @@ -628,6 +626,7 @@ def _copy_layers(hf_layers, pt_layers): return hf_model + textenc_conversion_lst = [ ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), @@ -653,7 +652,7 @@ def _copy_layers(hf_layers, pt_layers): def convert_laion_clap_checkpoint(checkpoint): - text_model = CLAPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = ClapTextModelWithProjection.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") keys = list(checkpoint.keys()) @@ -725,7 +724,7 @@ def load_pipeline_from_original_audioldm_ckpt( :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", "ddim"]`. :param model_type: The pipeline type. `None` to automatically infer, or one of - `["FrozenOpenCLAPEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for + `["FrozenOpenClapEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning. 
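As a rough illustration of how this conversion entry point is intended to be used once the series is complete, the sketch below drives load_pipeline_from_original_audioldm_ckpt directly rather than through scripts/convert_original_audioldm_to_diffusers.py. It is not part of the diff: the checkpoint and output paths are placeholders, and the scheduler and image-size values simply mirror defaults appearing elsewhere in these patches.

# Illustrative sketch only -- not part of the patch. Paths are placeholders.
from diffusers.pipelines.audioldm.convert_from_ckpt import load_pipeline_from_original_audioldm_ckpt

pipe = load_pipeline_from_original_audioldm_ckpt(
    checkpoint_path="ldm_trimmed.ckpt",  # original AudioLDM checkpoint (placeholder path)
    scheduler_type="ddim",               # same choice as the wrapper script's default
    image_size=1024,                     # AudioLDM mel-spectrogram "image" size used in this patch
    prediction_type="epsilon",
)
pipe.save_pretrained("audioldm-diffusers", safe_serialization=False)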
@@ -867,18 +866,14 @@ def default_audioldm_config(): and original_config["model"]["params"]["parameterization"] == "v" ): if prediction_type is None: - # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` - # as it relies on a brittle global step parameter here - prediction_type = "epsilon" if global_step == 875000 else "v_prediction" + prediction_type = "v_prediction" if image_size is None: - # NOTE: For stable diffusion 2 base one has to pass `image_size==512` - # as it relies on a brittle global step parameter here - image_size = 512 if global_step == 875000 else 768 + image_size = 1024 else: if prediction_type is None: prediction_type = "epsilon" if image_size is None: - image_size = 512 + image_size = 1024 num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -940,16 +935,21 @@ def default_audioldm_config(): if model_type == "CLAPAudioEmbeddingClassifierFreev2": # TODO: Load CLAP tokenizer + model - #text_model = CLAPTextModel.from_pretrained("laion-ai/clap-htsat-unfused") - #tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") + # text_model = ClapTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") + # tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") - config = CLAPTextConfig() - text_model = CLAPTextModel(config) + config = ClapTextConfig(projection_dim=512, projection_hidden_act="relu") + text_model = ClapTextModelWithProjection(config) tokenizer = AutoTokenizer.from_pretrained("roberta-base") vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") pipe = AudioLDMPipeline( - vae=vae, text_encoder=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler, vocoder=vocoder, - ) + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) - return pipe \ No newline at end of file + return pipe diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 9a93bacff877..2edf7142c84c 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -18,7 +18,7 @@ import torch from packaging import version -from transformers import CLAPTextModel, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan +from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -55,10 +55,10 @@ class AudioLDMPipeline(DiffusionPipeline): Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLAPTextModel`]): + text_encoder ([`ClapTextModelWithProjection`]): Frozen text-encoder. AudioLDM uses the text portion of - [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModel), specifically - the (TODO) variant. + [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.ClapTextModelWithProjection), + specifically the (TODO) variant. tokenizer ([`PreTrainedTokenizer`]): Tokenizer of class [RobertaTokenizer](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/clap#transformers.RobertaTokenizer). 
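For orientation on how the components documented above (VAE, CLAP text encoder, RoBERTa tokenizer, UNet, scheduler, SpeechT5 HiFi-GAN vocoder) fit together, a minimal usage sketch of the assembled pipeline follows. It is not part of the diff: the local model path, prompt, and output filename are placeholders, and the step count and guidance scale mirror the defaults introduced in this patch.

# Illustrative sketch only -- not part of the patch. The model path and prompt
# are placeholders; the sampling rate is read from the registered vocoder config.
import scipy.io.wavfile
from diffusers import AudioLDMPipeline

pipe = AudioLDMPipeline.from_pretrained("./audioldm-diffusers")
output = pipe(
    "a hammer hitting a wooden surface",
    num_inference_steps=200,  # default introduced in this patch
    guidance_scale=2.5,       # default introduced in this patch
)
waveform = output.audios[0]  # NumPy waveform produced by the SpeechT5 HiFi-GAN vocoder
scipy.io.wavfile.write("audio.wav", rate=pipe.vocoder.config.sampling_rate, data=waveform)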
@@ -67,14 +67,15 @@ class AudioLDMPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. vocoder (`SpeechT5HifiGan`): - Vocoder of class [SpeechT5HifiGan](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.SpeechT5HifiGan). + Vocoder of class + [SpeechT5HifiGan](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.SpeechT5HifiGan). """ _optional_components = ["safety_checker", "feature_extractor"] def __init__( self, vae: AutoencoderKL, - text_encoder: CLAPTextModel, + text_encoder: ClapTextModelWithProjection, tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, @@ -250,7 +251,7 @@ def _encode_prompt( untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] ) logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" + "The following part of your input was truncated because CLAP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) @@ -263,7 +264,8 @@ def _encode_prompt( text_input_ids.to(device), attention_mask=attention_mask, ) - prompt_embeds = prompt_embeds[0] + text_embeds = prompt_embeds.text_embeds + prompt_embeds = prompt_embeds.last_hidden_state prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) @@ -311,7 +313,7 @@ def _encode_prompt( uncond_input.input_ids.to(device), attention_mask=attention_mask, ) - negative_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.last_hidden_state if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method @@ -327,7 +329,7 @@ def _encode_prompt( # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - return prompt_embeds + return prompt_embeds, text_embeds def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents @@ -429,8 +431,8 @@ def __call__( prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, + num_inference_steps: int = 200, + guidance_scale: float = 2.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, @@ -454,10 +456,10 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): + num_inference_steps (`int`, *optional*, defaults to 200): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - guidance_scale (`float`, *optional*, defaults to 7.5): + guidance_scale (`float`, *optional*, defaults to 2.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > @@ -536,7 +538,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt - prompt_embeds = self._encode_prompt( + prompt_embeds, text_embeds = self._encode_prompt( prompt, device, num_images_per_prompt, @@ -552,7 +554,6 @@ def __call__( # 5. Prepare latent variables num_channels_latents = self.unet.in_channels - import ipdb; ipdb.set_trace() latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, @@ -575,11 +576,14 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + text_embeds_input = torch.cat([text_embeds] * 2) if do_classifier_free_guidance else latents + # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, + class_labels=text_embeds_input, cross_attention_kwargs=cross_attention_kwargs, ).sample From 81bff9925a1112d431cb1b2a46227a28239a41bb Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 14 Feb 2023 15:09:46 +0100 Subject: [PATCH 05/66] unconditional unet --- .../convert_original_audioldm_to_diffusers.py | 11 +--- src/diffusers/models/unet_2d.py | 60 +++++++++++++++---- src/diffusers/models/unet_2d_condition.py | 19 ------ .../pipelines/audioldm/convert_from_ckpt.py | 38 ++---------- .../pipelines/audioldm/pipeline_audioldm.py | 46 ++++++-------- 5 files changed, 73 insertions(+), 101 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 7e6ea25bd530..2c2d6ddb4d63 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -81,15 +81,6 @@ " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." ), ) - parser.add_argument( - "--upcast_attention", - default=False, - type=bool, - help=( - "Whether the attention computation should always be upcasted. This is necessary when running stable" - " diffusion 2.1." 
- ), - ) parser.add_argument( "--from_safetensors", action="store_true", @@ -119,7 +110,7 @@ extract_ema=args.extract_ema, scheduler_type=args.scheduler_type, num_in_channels=args.num_in_channels, - upcast_attention=args.upcast_attention, from_safetensors=args.from_safetensors, + device=args.device, ) pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 35f5dc34574c..da1212fad5f5 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -21,7 +21,7 @@ from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block +from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block, UNetMidBlock2DCrossAttn @dataclass @@ -101,6 +101,9 @@ def __init__( add_attention: bool = True, class_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, + extra_film_condition_dim: int = None, + extra_film_use_concat: bool = False, + cross_attention_dim: int = None, ): super().__init__() @@ -130,6 +133,18 @@ def __init__( else: self.class_embedding = None + # film condition + if self.class_embedding is not None and extra_film_condition_dim is not None: + raise ValueError("You cannot set both `class_embed_type` and `extra_film_use_concat`.") + self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat + self.use_extra_film_by_addition = extra_film_condition_dim is not None and not extra_film_use_concat + + if extra_film_condition_dim is not None: + self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + + if self.use_extra_film_by_concat: + time_embed_dim = time_embed_dim * 2 + self.down_blocks = nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) @@ -154,21 +169,36 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, ) self.down_blocks.append(down_block) # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - attn_num_head_channels=attention_head_dim, - resnet_groups=norm_num_groups, - add_attention=add_attention, - ) + if cross_attention_dim is not None: + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, + ) + else: + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, + add_attention=add_attention, + ) + # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -193,6 +223,7 @@ def __init__( resnet_groups=norm_num_groups, attn_num_head_channels=attention_head_dim, 
resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -255,6 +286,11 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb + if self.use_extra_film_by_addition: + emb = emb + self.film_embedding(class_labels) + elif self.use_extra_film_by_concat: + emb = torch.cat([emb, self.film_embedding(class_labels)], dim=-1) + # 2. pre-process skip_sample = sample sample = self.conv_in(sample) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 9180d30b9284..c524dbf2bed3 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -127,8 +127,6 @@ def __init__( num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", - extra_film_condition_dim: int = None, - extra_film_use_concat: bool = False, ): super().__init__() @@ -154,18 +152,6 @@ def __init__( else: self.class_embedding = None - # film condition - if self.class_embedding is not None and extra_film_condition_dim is not None: - raise ValueError("You cannot set both `class_embed_type` and `extra_film_use_concat`.") - self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat - self.use_extra_film_by_addition = extra_film_condition_dim is not None and not extra_film_use_concat - - if extra_film_condition_dim is not None: - self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) - - if self.use_extra_film_by_concat: - time_embed_dim = time_embed_dim * 2 - self.down_blocks = nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) @@ -492,11 +478,6 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb - if self.use_extra_film_by_addition: - emb = emb + self.film_embedding(class_labels) - elif self.use_extra_film_by_concat: - emb = torch.cat([emb, self.film_embedding(class_labels)], dim=-1) - # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 6fd90b838c7c..a8527f061647 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -28,7 +28,7 @@ HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel, + UNet2DModel, ) from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from transformers import ( @@ -90,22 +90,13 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping -def renew_attention_paths(old_list, n_shave_prefix_segments=0): +def renew_attention_paths(old_list): """ Updates paths inside attentions to the new naming scheme (local renaming) """ mapping = [] for old_item in old_list: new_item = old_item - - # new_item = new_item.replace('norm.weight', 'group_norm.weight') - # new_item = new_item.replace('norm.bias', 'group_norm.bias') - - # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - mapping.append({"old": old_item, "new": new_item}) return mapping @@ -700,7 +691,6 @@ def load_pipeline_from_original_audioldm_ckpt( extract_ema: bool = False, scheduler_type: str = "pndm", num_in_channels: int = None, - upcast_attention: bool = None, device: str = None, from_safetensors: bool = False, ) -> AudioLDMPipeline: @@ -728,9 +718,6 @@ def load_pipeline_from_original_audioldm_ckpt( checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning. - :param upcast_attention: Whether the attention computation should always be upcasted. This is necessary when - running - stable diffusion 2.1. :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. @@ -843,13 +830,6 @@ def default_audioldm_config(): else: checkpoint = torch.load(checkpoint_path, map_location=device) - # Sometimes models don't have the global_step item - if "global_step" in checkpoint: - global_step = checkpoint["global_step"] - else: - print("global_step key not found in model") - global_step = None - if "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] @@ -911,10 +891,9 @@ def default_audioldm_config(): else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - # Convert the UNet2DConditionModel model. 
+ # Convert the UNet2DModel unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet_config["upcast_attention"] = upcast_attention - unet = UNet2DConditionModel(**unet_config) + unet = UNet2DModel(**unet_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema @@ -934,13 +913,8 @@ def default_audioldm_config(): model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if model_type == "CLAPAudioEmbeddingClassifierFreev2": - # TODO: Load CLAP tokenizer + model - # text_model = ClapTextModelWithProjection.from_pretrained("laion-ai/clap-htsat-unfused") - # tokenizer = AutoTokenizer.from_pretrained("laion-ai/clap-htsat-unfused") - - config = ClapTextConfig(projection_dim=512, projection_hidden_act="relu") - text_model = ClapTextModelWithProjection(config) - tokenizer = AutoTokenizer.from_pretrained("roberta-base") + text_model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") pipe = AudioLDMPipeline( diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 2edf7142c84c..381f21ea60ae 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -196,7 +196,7 @@ def _encode_prompt( self, prompt, device, - num_images_per_prompt, + num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, @@ -210,8 +210,8 @@ def _encode_prompt( prompt to be encoded device: (`torch.device`): torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt + num_waveforms_per_prompt (`int`): + number of waveforms that should be generated per prompt do_classifier_free_guidance (`bool`): whether to use classifier free guidance or not negative_ prompt (`str` or `List[str]`, *optional*): @@ -264,15 +264,14 @@ def _encode_prompt( text_input_ids.to(device), attention_mask=attention_mask, ) - text_embeds = prompt_embeds.text_embeds - prompt_embeds = prompt_embeds.last_hidden_state + prompt_embeds = prompt_embeds.text_embeds prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) - bs_embed, seq_len, _ = prompt_embeds.shape + bs_embed, seq_len, = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt) + prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -313,7 +312,7 @@ def _encode_prompt( uncond_input.input_ids.to(device), attention_mask=attention_mask, ) - negative_prompt_embeds = negative_prompt_embeds.last_hidden_state + negative_prompt_embeds = negative_prompt_embeds.text_embeds if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method @@ -321,15 +320,15 @@ def _encode_prompt( negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, 
device=device) - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - return prompt_embeds, text_embeds + return prompt_embeds def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents @@ -434,7 +433,7 @@ def __call__( num_inference_steps: int = 200, guidance_scale: float = 2.5, negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: Optional[int] = 1, + num_waveforms_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, @@ -443,7 +442,6 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -469,8 +467,8 @@ def __call__( The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. + num_waveforms_per_prompt (`int`, *optional*, defaults to 1): + The number of waveforms to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. @@ -500,10 +498,6 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Examples: @@ -538,10 +532,10 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt - prompt_embeds, text_embeds = self._encode_prompt( + prompt_embeds = self._encode_prompt( prompt, device, - num_images_per_prompt, + num_waveforms_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, @@ -555,7 +549,7 @@ def __call__( # 5. 
Prepare latent variables num_channels_latents = self.unet.in_channels latents = self.prepare_latents( - batch_size * num_images_per_prompt, + batch_size * num_waveforms_per_prompt, num_channels_latents, height, width, @@ -576,15 +570,11 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - text_embeds_input = torch.cat([text_embeds] * 2) if do_classifier_free_guidance else latents - # predict the noise residual noise_pred = self.unet( latent_model_input, t, - encoder_hidden_states=prompt_embeds, - class_labels=text_embeds_input, - cross_attention_kwargs=cross_attention_kwargs, + class_labels=prompt_embeds, ).sample # perform guidance From 6aa4fda37994e9cc8ff1896534b4f803f6eed259 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Feb 2023 11:39:35 +0100 Subject: [PATCH 06/66] clap, vocoder and vae --- src/diffusers/models/unet_2d.py | 18 +- .../pipelines/audioldm/convert_from_ckpt.py | 390 +++++++++--------- .../pipelines/audioldm/pipeline_audioldm.py | 28 +- 3 files changed, 234 insertions(+), 202 deletions(-) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index da1212fad5f5..f8ec535964d6 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -135,14 +135,16 @@ def __init__( # film condition if self.class_embedding is not None and extra_film_condition_dim is not None: - raise ValueError("You cannot set both `class_embed_type` and `extra_film_use_concat`.") + raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat - self.use_extra_film_by_addition = extra_film_condition_dim is not None and not extra_film_use_concat if extra_film_condition_dim is not None: self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + else: + self.film_embedding = None if self.use_extra_film_by_concat: + # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim time_embed_dim = time_embed_dim * 2 self.down_blocks = nn.ModuleList([]) @@ -286,10 +288,14 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb - if self.use_extra_film_by_addition: - emb = emb + self.film_embedding(class_labels) - elif self.use_extra_film_by_concat: - emb = torch.cat([emb, self.film_embedding(class_labels)], dim=-1) + if self.film_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when doing film embedding") + film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) + if self.use_extra_film_by_concat: + emb = torch.cat([emb, film_emb], dim=-1) + else: + emb = emb + film_emb # 2. 
pre-process skip_sample = sample diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index a8527f061647..f3c3761043d6 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -36,6 +36,7 @@ ClapTextConfig, ClapTextModelWithProjection, SpeechT5HifiGan, + SpeechT5HifiGanConfig, ) from ...utils import is_omegaconf_available, is_safetensors_available @@ -241,7 +242,7 @@ def create_unet_diffusers_config(original_config, image_size: int): return config -def create_vae_diffusers_config(original_config, image_size: int): +def create_vae_diffusers_config(original_config, scaling_factor: float, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. """ @@ -261,6 +262,7 @@ def create_vae_diffusers_config(original_config, image_size: int): block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, layers_per_block=vae_params.num_res_blocks, + scaling_factor=float(scaling_factor) ) return config @@ -567,182 +569,68 @@ def convert_ldm_vae_checkpoint(checkpoint, config): conv_attn_to_linear(new_checkpoint) return new_checkpoint - -def convert_ldm_bert_checkpoint(checkpoint, config): - def _copy_attn_layer(hf_attn_layer, pt_attn_layer): - hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight - hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight - hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight - - hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight - hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias - - def _copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - def _copy_layer(hf_layer, pt_layer): - # copy layer norms - _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) - _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) - - # copy attn - _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) - - # copy MLP - pt_mlp = pt_layer[1][1] - _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) - _copy_linear(hf_layer.fc2, pt_mlp.net[2]) - - def _copy_layers(hf_layers, pt_layers): - for i, hf_layer in enumerate(hf_layers): - if i != 0: - i += i - pt_layer = pt_layers[i : i + 2] - _copy_layer(hf_layer, pt_layer) - - hf_model = LDMBertModel(config).eval() - - # copy embeds - hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight - hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight - - # copy layer norm - _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) - - # copy hidden layers - _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) - - _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) - - return hf_model - - -textenc_conversion_lst = [ - ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), - ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), - ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), -] -textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} - -textenc_transformer_conversion_lst = [ - # (stable-diffusion, HF Diffusers) - ("resblocks.", "text_model.encoder.layers."), - ("ln_1", "layer_norm1"), - ("ln_2", "layer_norm2"), - (".c_fc.", ".fc1."), - (".c_proj.", 
".fc2."), - (".attn", ".self_attn"), - ("ln_final.", "transformer.text_model.final_layer_norm."), - ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"), - ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"), -] -protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} -textenc_pattern = re.compile("|".join(protected.keys())) - - -def convert_laion_clap_checkpoint(checkpoint): - text_model = ClapTextModelWithProjection.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") - - keys = list(checkpoint.keys()) - - text_model_dict = {} - - d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) - - text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") - - for key in keys: - if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer - continue - if key in textenc_conversion_map: - text_model_dict[textenc_conversion_map[key]] = checkpoint[key] - if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer.") :] - if new_key.endswith(".in_proj_weight"): - new_key = new_key[: -len(".in_proj_weight")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] - elif new_key.endswith(".in_proj_bias"): - new_key = new_key[: -len(".in_proj_bias")] - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] - else: - new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) - - text_model_dict[new_key] = checkpoint[key] - - text_model.load_state_dict(text_model_dict) - - return text_model - - -def load_pipeline_from_original_audioldm_ckpt( - checkpoint_path: str, - original_config_file: str = None, - image_size: int = 1024, - prediction_type: str = None, - model_type: str = None, - extract_ema: bool = False, - scheduler_type: str = "pndm", - num_in_channels: int = None, - device: str = None, - from_safetensors: bool = False, -) -> AudioLDMPipeline: +def create_transformers_diffusers_config(original_config): """ - Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` - config file. - - Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the - global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is - recommended that you override the default values and/or supply an `original_config_file` wherever possible. - - :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file - corresponding to the original architecture. If `None`, will be - automatically inferred by looking for a key that only exists in SD2.0 models. - :param image_size: The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable - Siffusion v2 - Base. 
Use 768 for Stable Diffusion v2. - :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion - v1.X and Stable - Siffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2. - :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically - inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", - "euler-ancestral", "dpm", "ddim"]`. :param model_type: The pipeline type. `None` to automatically infer, or one of - `["FrozenOpenClapEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights - or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher - quality images for inference. Non-EMA weights are usually better to continue fine-tuning. - :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If - `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A - StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. """ + config = dict( + model_in_dim = original_config.num_mels, + sampling_rate = original_config.sampling_rate, + upsample_initial_channel = original_config.upsample_initial_channel, + upsample_rates = list(original_config.upsample_rates), + upsample_kernel_sizes = list(original_config.upsample_kernel_sizes), + resblock_kernel_sizes = list(original_config.resblock_kernel_sizes), + resblock_dilation_sizes = [list(resblock_dilation) for resblock_dilation in original_config.resblock_dilation_sizes], + normalize_before=False, + ) + return config - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf +def convert_hifigan_checkpoint( + checkpoint, + config, +): + # extract state dict for vocoder + vocoder_state_dict = {} + vocoder_key = "first_stage_model.vocoder." 
+ keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vocoder_key): + vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key) + + for i in range(len(config.upsample_rates)): + vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight") + vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias") + + if not config.normalize_before: + # these are dummy variables that are unused if we don't normalize before the vocoder + vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim) + vocoder_state_dict["scale"] = torch.ones(config.model_in_dim) + + return vocoder_state_dict + +def default_vocoder_config(): + return { + "upsample_rates": [5, 4, 2, 2, 2], + "upsample_kernel_sizes": [16, 16, 8, 4, 4], + "upsample_initial_channel": 1024, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "num_mels": 64, + "num_freq": 1025, + "n_fft": 1024, + "hop_size": 160, + "win_size": 1024, + "sampling_rate": 16000, + "fmin": 0, + "fmax": 8000, + "fmax_for_loss": None, + } - # TODO: remove this func for final PR - # Copied from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 - def default_audioldm_config(): - return OmegaConf.create( - { - "wave_file_save_path": "./output", - "id": { - "version": "v1", - "name": "default", - "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml", - }, +# TODO: remove this func for final PR +# Copied from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 +def default_audioldm_config(): + return { "model": { - "device": "cuda", - "reload_from_ckpt": "/mnt/fast/nobackup/scratch4weeks/hl01486/exps/audio_generation/stablediffusion/LDM/audioverse/2023_01_14_full_F4_B_spatial_v2_v1/checkpoints/last.ckpt", - "target": "audioldm.pipline.LatentDiffusion", "params": { "base_learning_rate": 5e-06, "linear_start": 0.0015, @@ -811,7 +699,122 @@ def default_audioldm_config(): }, }, } - ) + +clap_keys_to_modify_mapping = { + "text_branch": "text_model", + "attn": "attention.self", + "self.proj": "output.dense", + "attention.self_mask": "attn_mask", + "mlp.fc1": "intermediate.dense", + "mlp.fc2": "output.dense", + "norm1": "layernorm_before", + "norm2": "layernorm_after", + "bn0": "batch_norm", +} + +clap_keys_to_ignore = ["text_transform"] + +def convert_open_clap_checkpoint(checkpoint): + # extract state dict for VAE + model_state_dict = {} + model_key = "cond_stage_model.model.text_" + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(model_key): + model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key) + + new_checkpoint = {} + + sequential_layers_pattern = r".*sequential.(\d+).*" + text_projection_pattern = r".*_projection.(\d+).*" + + for key, value in model_state_dict.items(): + # check if key should be ignored in mapping + if key.split(".")[0] in clap_keys_to_ignore: + continue + + # check if any key needs to be modified + for key_to_modify, new_key in clap_keys_to_modify_mapping.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + if re.match(sequential_layers_pattern, key): + # replace sequential layers with list + sequential_layer = re.match(sequential_layers_pattern, key).group(1) + + key = 
key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") + elif re.match(text_projection_pattern, key): + projecton_layer = int(re.match(text_projection_pattern, key).group(1)) + + # Because in CLAP they use `nn.Sequential`... + transformers_projection_layer = 1 if projecton_layer == 0 else 2 + + key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") + + if "audio" and "qkv" in key: + # split qkv into query key and value + mixed_qkv = value + qkv_dim = mixed_qkv.size(0) // 3 + + query_layer = mixed_qkv[:qkv_dim] + key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[qkv_dim * 2 :] + + new_checkpoint[key.replace("qkv", "query")] = query_layer + new_checkpoint[key.replace("qkv", "key")] = key_layer + new_checkpoint[key.replace("qkv", "value")] = value_layer + else: + new_checkpoint[key] = value + + return new_checkpoint + + +def load_pipeline_from_original_audioldm_ckpt( + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 1024, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: int = None, + device: str = None, + from_safetensors: bool = False, + original_vocoder_config_file: str = None, +) -> AudioLDMPipeline: + """ + Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` + config file. + + Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the + global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is + recommended that you override the default values and/or supply an `original_config_file` wherever possible. + + :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file + corresponding to the original architecture. If `None`, will be + automatically inferred by looking for a key that only exists in SD2.0 models. + :param image_size: The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable + Siffusion v2 + Base. Use 768 for Stable Diffusion v2. + :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion + v1.X and Stable + Siffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2. + :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically + inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", + "euler-ancestral", "dpm", "ddim"]`. :param model_type: The pipeline type. `None` to automatically infer, or one of + `["FrozenOpenClapEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights + or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher + quality images for inference. Non-EMA weights are usually better to continue fine-tuning. + :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If + `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A + StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. 
+ """ + + if not is_omegaconf_available(): + raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) + + from omegaconf import OmegaConf if from_safetensors: if not is_safetensors_available(): @@ -835,6 +838,7 @@ def default_audioldm_config(): if original_config_file is None: original_config = default_audioldm_config() + original_config = OmegaConf.create(original_config) else: original_config = OmegaConf.load(original_config_file) @@ -902,7 +906,7 @@ def default_audioldm_config(): unet.load_state_dict(converted_unet_checkpoint) # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + vae_config = create_vae_diffusers_config(original_config, scaling_factor=checkpoint["scale_factor"], image_size=image_size) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) @@ -913,17 +917,35 @@ def default_audioldm_config(): model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if model_type == "CLAPAudioEmbeddingClassifierFreev2": - text_model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + # AudioLDM uses the same configuration and tokenizer as the original CLAP model + config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused") tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") - vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") - pipe = AudioLDMPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - vocoder=vocoder, - ) + converted_text_model = convert_open_clap_checkpoint(checkpoint) + text_model = ClapTextModelWithProjection(config) + text_model.load_state_dict(converted_text_model) + + # Convert the vocoder model TODO: add the vocoder config to full config + if original_vocoder_config_file is None: + original_vocoder_config = default_vocoder_config() + original_vocoder_config = OmegaConf.create(original_vocoder_config) + else: + original_vocoder_config = OmegaConf.load(original_vocoder_config_file) + + vocoder_config = create_transformers_diffusers_config(original_vocoder_config) + vocoder_config = SpeechT5HifiGanConfig(**vocoder_config) + converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config) + + vocoder = SpeechT5HifiGan(vocoder_config) + vocoder.load_state_dict(converted_vocoder_checkpoint) + + pipe = AudioLDMPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) return pipe diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 381f21ea60ae..817a5f2d946a 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -16,6 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import torch +import torch.nn.functional as F from packaging import version from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan @@ -242,6 +243,7 @@ def _encode_prompt( return_tensors="pt", ) text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( @@ -255,16 +257,13 @@ def _encode_prompt( f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - 
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - prompt_embeds = self.text_encoder( text_input_ids.to(device), - attention_mask=attention_mask, + attention_mask=attention_mask.to(device), ) prompt_embeds = prompt_embeds.text_embeds + # additional L_2 normalization over each hidden-state + prompt_embeds = F.normalize(prompt_embeds, dim=-1) prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) @@ -303,16 +302,16 @@ def _encode_prompt( return_tensors="pt", ) - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None + uncond_input_ids = uncond_input.input_ids.to(device) + attention_mask = uncond_input.attention_mask.to(device) negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), + uncond_input_ids, attention_mask=attention_mask, ) negative_prompt_embeds = negative_prompt_embeds.text_embeds + # additional L_2 normalization over each hidden-state + negative_prompt_embeds = F.normalize(negative_prompt_embeds, dim=-1) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method @@ -336,7 +335,12 @@ def decode_latents(self, latents): return mel_spectrogram def mel_spectrogram_to_waveform(self, mel_spectrogram): - mel_spectrogram = mel_spectrogram.permute(0, 2, 1) + if mel_spectrogram.dim() == 4: + mel_spectrogram = mel_spectrogram.squeeze(1) + + mel_spectrogram = mel_spectrogram.squeeze(1) + # TODO: our transformers implementation can't handle batching! Squeeze tensor from [bsz, seq_len, mel_bins] to [seq_len, mel_bins] + mel_spectrogram = mel_spectrogram.squeeze(0) waveform = self.vocoder(mel_spectrogram) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 waveform = waveform.cpu().detach().numpy() From 2482b42d0a750ab277b80aa655a59a10282c3ed1 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 20 Feb 2023 13:39:00 +0100 Subject: [PATCH 07/66] clean-up: conversion scripts --- .../convert_original_audioldm_to_diffusers.py | 15 +- .../pipelines/audioldm/convert_from_ckpt.py | 365 ++++++++---------- .../pipelines/audioldm/pipeline_audioldm.py | 4 +- 3 files changed, 172 insertions(+), 212 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 2c2d6ddb4d63..9d0684cb27cc 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Conversion script for the LDM checkpoints. """ +""" Conversion script for the AudioLDM checkpoints. """ import argparse @@ -59,8 +59,7 @@ default=None, type=int, help=( - "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Siffusion v2" - " Base. Use 768 for Stable Diffusion v2." 
+ "The image size that the model was trained on." ), ) parser.add_argument( @@ -68,13 +67,12 @@ default=None, type=str, help=( - "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable" - " Siffusion v2 Base. Use 'v-prediction' for Stable Diffusion v2." + "The prediction type that the model was trained on." ), ) parser.add_argument( "--extract_ema", - action="store_true", + action="store_false", # TODO: revert to store_true help=( "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" @@ -93,7 +91,7 @@ ) parser.add_argument( "--dump_path", - default="/Users/sanchitgandhi/convert-audioldm/diffusers_out", + default="/Users/sanchitgandhi/convert-audioldm/diffusers_out_2", type=str, required=False, # TODO: revert to True help="Path to the output model.", @@ -106,7 +104,6 @@ original_config_file=args.original_config_file, image_size=args.image_size, prediction_type=args.prediction_type, - model_type=args.pipeline_type, extract_ema=args.extract_ema, scheduler_type=args.scheduler_type, num_in_channels=args.num_in_channels, diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index f3c3761043d6..743e8ca94fef 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. +# Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Conversion script for the Stable Diffusion checkpoints.""" +""" Conversion script for the AudioLDM checkpoints.""" import re @@ -30,7 +30,6 @@ PNDMScheduler, UNet2DModel, ) -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from transformers import ( AutoTokenizer, ClapTextConfig, @@ -43,6 +42,7 @@ from ...utils.import_utils import BACKENDS_MAPPING +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.shave_segments def shave_segments(path, n_shave_prefix_segments=1): """ Removes segments. Positive values shave the first segments, negative shave the last segments. 
@@ -53,6 +53,7 @@ def shave_segments(path, n_shave_prefix_segments=1): return ".".join(path.split(".")[:n_shave_prefix_segments]) +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_resnet_paths def renew_resnet_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside resnets to the new naming scheme (local renaming) @@ -75,6 +76,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_vae_resnet_paths def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside resnets to the new naming scheme (local renaming) @@ -91,6 +93,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_attention_paths def renew_attention_paths(old_list): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -103,6 +106,7 @@ def renew_attention_paths(old_list): return mapping +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_vae_attention_paths def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -133,6 +137,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): return mapping +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.assign_to_checkpoint def assign_to_checkpoint( paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None ): @@ -184,6 +189,7 @@ def assign_to_checkpoint( checkpoint[new_path] = old_checkpoint[path["old"]] +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_attn_to_linear def conv_attn_to_linear(checkpoint): keys = list(checkpoint.keys()) attn_keys = ["query.weight", "key.weight", "value.weight"] @@ -198,7 +204,7 @@ def conv_attn_to_linear(checkpoint): def create_unet_diffusers_config(original_config, image_size: int): """ - Creates a config for the diffusers based on the config of the LDM model. + Creates a UNet config for diffusers based on the config of the original AudioLDM model. """ unet_params = original_config.model.params.unet_config.params vae_params = original_config.model.params.first_stage_config.params.ddconfig @@ -234,7 +240,7 @@ def create_unet_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=True, + cross_attention_dim=True, # TODO: hacky - what are we doing re cross attention? extra_film_condition_dim=extra_film_condition_dim, extra_film_use_concat=extra_film_use_concat, ) @@ -242,9 +248,12 @@ def create_unet_diffusers_config(original_config, image_size: int): return config -def create_vae_diffusers_config(original_config, scaling_factor: float, image_size: int): +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.create_vae_diffusers_config +def create_vae_diffusers_config(original_config, checkpoint, image_size: int): """ - Creates a config for the diffusers based on the config of the LDM model. + Creates a VAE config for diffusers based on the config of the original AudioLDM model. + Compared to the original Stable Diffusion conversion, this function passes a + *learnt* VAE scaling factor to the diffusers VAE. 
""" vae_params = original_config.model.params.first_stage_config.params.ddconfig _ = original_config.model.params.first_stage_config.params.embed_dim @@ -253,6 +262,8 @@ def create_vae_diffusers_config(original_config, scaling_factor: float, image_si down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + config = dict( sample_size=image_size, in_channels=vae_params.in_channels, @@ -262,11 +273,12 @@ def create_vae_diffusers_config(original_config, scaling_factor: float, image_si block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, layers_per_block=vae_params.num_res_blocks, - scaling_factor=float(scaling_factor) + scaling_factor=float(scaling_factor), ) return config +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( num_train_timesteps=original_config.model.params.timesteps, @@ -277,19 +289,12 @@ def create_diffusers_schedular(original_config): return schedular -def create_ldm_bert_config(original_config): - bert_params = original_config.model.parms.cond_stage_config.params - config = LDMBertConfig( - d_model=bert_params.n_embed, - encoder_layers=bert_params.n_layer, - encoder_ffn_dim=bert_params.n_embed * 4, - ) - return config - - +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_ldm_unet_checkpoint def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ - Takes a state dict and a config, and returns a converted checkpoint. + Takes a state dict and a config, and returns a converted checkpoint. Compared to the + original Stable Diffusion conversion, this function additionally converts the learnt + film embedding linear layer. """ # extract state_dict for UNet @@ -463,6 +468,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False return new_checkpoint +# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_ldm_vae_checkpoint def convert_ldm_vae_checkpoint(checkpoint, config): # extract state dict for VAE vae_state_dict = {} @@ -569,26 +575,101 @@ def convert_ldm_vae_checkpoint(checkpoint, config): conv_attn_to_linear(new_checkpoint) return new_checkpoint -def create_transformers_diffusers_config(original_config): + +CLAP_KEYS_TO_MODIFY_MAPPING = { + "text_branch": "text_model", + "attn": "attention.self", + "self.proj": "output.dense", + "attention.self_mask": "attn_mask", + "mlp.fc1": "intermediate.dense", + "mlp.fc2": "output.dense", + "norm1": "layernorm_before", + "norm2": "layernorm_after", + "bn0": "batch_norm", +} + +CLAP_KEYS_TO_IGNORE = ["text_transform"] + +def convert_open_clap_checkpoint(checkpoint): + """ + Takes a state dict and returns a converted CLAP checkpoint. 
+ """ + # extract state dict for CLAP text embedding model, discarding the audio component + model_state_dict = {} + model_key = "cond_stage_model.model.text_" + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(model_key): + model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key) + + new_checkpoint = {} + + sequential_layers_pattern = r".*sequential.(\d+).*" + text_projection_pattern = r".*_projection.(\d+).*" + + for key, value in model_state_dict.items(): + # check if key should be ignored in mapping + if key.split(".")[0] in CLAP_KEYS_TO_IGNORE: + continue + + # check if any key needs to be modified + for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + if re.match(sequential_layers_pattern, key): + # replace sequential layers with list + sequential_layer = re.match(sequential_layers_pattern, key).group(1) + + key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") + elif re.match(text_projection_pattern, key): + projecton_layer = int(re.match(text_projection_pattern, key).group(1)) + + # Because in CLAP they use `nn.Sequential`... + transformers_projection_layer = 1 if projecton_layer == 0 else 2 + + key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") + + if "audio" and "qkv" in key: + # split qkv into query key and value + mixed_qkv = value + qkv_dim = mixed_qkv.size(0) // 3 + + query_layer = mixed_qkv[:qkv_dim] + key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[qkv_dim * 2 :] + + new_checkpoint[key.replace("qkv", "query")] = query_layer + new_checkpoint[key.replace("qkv", "key")] = key_layer + new_checkpoint[key.replace("qkv", "value")] = value_layer + else: + new_checkpoint[key] = value + + return new_checkpoint + +def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. """ + vocoder_params = original_config.model.params.vocoder_config.params + config = dict( - model_in_dim = original_config.num_mels, - sampling_rate = original_config.sampling_rate, - upsample_initial_channel = original_config.upsample_initial_channel, - upsample_rates = list(original_config.upsample_rates), - upsample_kernel_sizes = list(original_config.upsample_kernel_sizes), - resblock_kernel_sizes = list(original_config.resblock_kernel_sizes), - resblock_dilation_sizes = [list(resblock_dilation) for resblock_dilation in original_config.resblock_dilation_sizes], + model_in_dim = vocoder_params.num_mels, + sampling_rate = vocoder_params.sampling_rate, + upsample_initial_channel = vocoder_params.upsample_initial_channel, + upsample_rates = list(vocoder_params.upsample_rates), + upsample_kernel_sizes = list(vocoder_params.upsample_kernel_sizes), + resblock_kernel_sizes = list(vocoder_params.resblock_kernel_sizes), + resblock_dilation_sizes = [list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes], normalize_before=False, ) + return config -def convert_hifigan_checkpoint( - checkpoint, - config, -): +def convert_hifigan_checkpoint(checkpoint, config): + """ + Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint. + """ # extract state dict for vocoder vocoder_state_dict = {} vocoder_key = "first_stage_model.vocoder." 
@@ -597,60 +678,31 @@ def convert_hifigan_checkpoint( if key.startswith(vocoder_key): vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key) + # fix upsampler keys, everything else is correct already for i in range(len(config.upsample_rates)): vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight") vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias") if not config.normalize_before: - # these are dummy variables that are unused if we don't normalize before the vocoder + # if we don't normalize before these variables are unused, so we set them to arbitrary values + # TODO: fix this in transformers vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim) vocoder_state_dict["scale"] = torch.ones(config.model_in_dim) return vocoder_state_dict -def default_vocoder_config(): - return { - "upsample_rates": [5, 4, 2, 2, 2], - "upsample_kernel_sizes": [16, 16, 8, 4, 4], - "upsample_initial_channel": 1024, - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "num_mels": 64, - "num_freq": 1025, - "n_fft": 1024, - "hop_size": 160, - "win_size": 1024, - "sampling_rate": 16000, - "fmin": 0, - "fmax": 8000, - "fmax_for_loss": None, - } - -# TODO: remove this func for final PR -# Copied from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 -def default_audioldm_config(): - return { +# Adapted from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 +DEFAULT_CONFIG = { "model": { "params": { - "base_learning_rate": 5e-06, "linear_start": 0.0015, "linear_end": 0.0195, - "num_timesteps_cond": 1, - "log_every_t": 200, "timesteps": 1000, - "first_stage_key": "fbank", - "cond_stage_key": "waveform", - "latent_t_size": 256, - "latent_f_size": 16, "channels": 8, - "cond_stage_trainable": True, - "conditioning_key": "film", - "monitor": "val/loss_simple_ema", "scale_by_std": True, "unet_config": { "target": "audioldm.latent_diffusion.openaimodel.UNetModel", "params": { - "image_size": 64, "extra_film_condition_dim": 512, "extra_film_use_concat": True, "in_channels": 8, @@ -660,155 +712,76 @@ def default_audioldm_config(): "num_res_blocks": 2, "channel_mult": [1, 2, 3, 5], "num_head_channels": 32, - "use_spatial_transformer": True, }, }, "first_stage_config": { - "base_learning_rate": 4.5e-05, "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", "params": { - "monitor": "val/rec_loss", - "image_key": "fbank", - "subband": 1, "embed_dim": 8, - "time_shuffle": 1, "ddconfig": { - "double_z": True, "z_channels": 8, "resolution": 256, - "downsample_time": False, "in_channels": 1, "out_ch": 1, "ch": 128, "ch_mult": [1, 2, 4], "num_res_blocks": 2, - "attn_resolutions": [], - "dropout": 0.0, }, }, }, - "cond_stage_config": { - "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2", + "vocoder_config": { + "target": "audioldm.first_stage_model.vocoder", "params": { - "key": "waveform", + "upsample_rates": [5, 4, 2, 2, 2], + "upsample_kernel_sizes": [16, 16, 8, 4, 4], + "upsample_initial_channel": 1024, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "num_mels": 64, "sampling_rate": 16000, - "embed_mode": "audio", - "unconditional_prob": 0.1, - }, - }, + } + + } }, }, } -clap_keys_to_modify_mapping = { - 
"text_branch": "text_model", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -clap_keys_to_ignore = ["text_transform"] - -def convert_open_clap_checkpoint(checkpoint): - # extract state dict for VAE - model_state_dict = {} - model_key = "cond_stage_model.model.text_" - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(model_key): - model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key) - - new_checkpoint = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in model_state_dict.items(): - # check if key should be ignored in mapping - if key.split(".")[0] in clap_keys_to_ignore: - continue - - # check if any key needs to be modified - for key_to_modify, new_key in clap_keys_to_modify_mapping.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... - transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - new_checkpoint[key.replace("qkv", "query")] = query_layer - new_checkpoint[key.replace("qkv", "key")] = key_layer - new_checkpoint[key.replace("qkv", "value")] = value_layer - else: - new_checkpoint[key] = value - - return new_checkpoint - - def load_pipeline_from_original_audioldm_ckpt( checkpoint_path: str, original_config_file: str = None, - image_size: int = 1024, + image_size: int = 512, prediction_type: str = None, - model_type: str = None, extract_ema: bool = False, - scheduler_type: str = "pndm", + scheduler_type: str = "ddim", num_in_channels: int = None, device: str = None, from_safetensors: bool = False, - original_vocoder_config_file: str = None, ) -> AudioLDMPipeline: """ - Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` - config file. + Load an AudioLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file. Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is recommended that you override the default values and/or supply an `original_config_file` wherever possible. - :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file - corresponding to the original architecture. If `None`, will be - automatically inferred by looking for a key that only exists in SD2.0 models. - :param image_size: The image size that the model was trained on. 
Use 512 for Stable Diffusion v1.X and Stable - Siffusion v2 - Base. Use 768 for Stable Diffusion v2. - :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion - v1.X and Stable - Siffusion v2 Base. Use `'v-prediction'` for Stable Diffusion v2. + :param checkpoint_path: Path to `.ckpt` file. + :param original_config_file: Path to `.yaml` config file corresponding to the original architecture. + If `None`, will be automatically instantiated based on default values. + :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints. + :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original + AudioLDM checkpoints. :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically - inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", - "euler-ancestral", "dpm", "ddim"]`. :param model_type: The pipeline type. `None` to automatically infer, or one of - `["FrozenOpenClapEmbedder", "FrozenCLAPEmbedder", "PaintByExample"]`. :param extract_ema: Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights - or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher - quality images for inference. Non-EMA weights are usually better to continue fine-tuning. - :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If - `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A - StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + inferred. + :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", + "euler-ancestral", "dpm", "ddim"]`. + :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract + the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights + usually yield higher quality images for inference. Non-EMA weights are usually better to continue + fine-tuning. + :param device: The device to use. Pass `None` to determine automatically. + :param from_safetensors: If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors + instead of PyTorch. + :return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ if not is_omegaconf_available(): @@ -837,7 +810,7 @@ def load_pipeline_from_original_audioldm_ckpt( checkpoint = checkpoint["state_dict"] if original_config_file is None: - original_config = default_audioldm_config() + original_config = DEFAULT_CONFIG original_config = OmegaConf.create(original_config) else: original_config = OmegaConf.load(original_config_file) @@ -851,13 +824,12 @@ def load_pipeline_from_original_audioldm_ckpt( ): if prediction_type is None: prediction_type = "v_prediction" - if image_size is None: - image_size = 1024 else: if prediction_type is None: prediction_type = "epsilon" - if image_size is None: - image_size = 1024 + + if image_size is None: + image_size = 512 num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -905,40 +877,31 @@ def load_pipeline_from_original_audioldm_ckpt( unet.load_state_dict(converted_unet_checkpoint) - # Convert the VAE model. 
- vae_config = create_vae_diffusers_config(original_config, scaling_factor=checkpoint["scale_factor"], image_size=image_size) + # Convert the VAE model + vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size) converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.load_state_dict(converted_vae_checkpoint) - # Convert the text model. - if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - - if model_type == "CLAPAudioEmbeddingClassifierFreev2": - # AudioLDM uses the same configuration and tokenizer as the original CLAP model - config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused") - tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + # Convert the text model + # AudioLDM uses the same configuration and tokenizer as the original CLAP model + config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused") + tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") - converted_text_model = convert_open_clap_checkpoint(checkpoint) - text_model = ClapTextModelWithProjection(config) - text_model.load_state_dict(converted_text_model) - - # Convert the vocoder model TODO: add the vocoder config to full config - if original_vocoder_config_file is None: - original_vocoder_config = default_vocoder_config() - original_vocoder_config = OmegaConf.create(original_vocoder_config) - else: - original_vocoder_config = OmegaConf.load(original_vocoder_config_file) + converted_text_model = convert_open_clap_checkpoint(checkpoint) + text_model = ClapTextModelWithProjection(config) + text_model.load_state_dict(converted_text_model) - vocoder_config = create_transformers_diffusers_config(original_vocoder_config) + # Convert the vocoder model + vocoder_config = create_transformers_vocoder_config(original_config) vocoder_config = SpeechT5HifiGanConfig(**vocoder_config) converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config) vocoder = SpeechT5HifiGan(vocoder_config) vocoder.load_state_dict(converted_vocoder_checkpoint) + # Instantiate the diffusers pipeline pipe = AudioLDMPipeline( vae=vae, text_encoder=text_model, diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 817a5f2d946a..efe35aa1aea9 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -22,7 +22,7 @@ from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel +from ...models import AutoencoderKL, UNet2DModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline @@ -78,7 +78,7 @@ def __init__( vae: AutoencoderKL, text_encoder: ClapTextModelWithProjection, tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], - unet: UNet2DConditionModel, + unet: UNet2DModel, scheduler: KarrasDiffusionSchedulers, vocoder: SpeechT5HifiGan, ): From 9d986c4745b5111e9bde341970b98b5fdd0c3bb8 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 20 Feb 2023 17:09:45 +0100 Subject: [PATCH 08/66] fix: conversion script token_type_ids --- 
.../pipelines/audioldm/convert_from_ckpt.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 743e8ca94fef..dd58f5c3dac5 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -590,6 +590,8 @@ def convert_ldm_vae_checkpoint(checkpoint, config): CLAP_KEYS_TO_IGNORE = ["text_transform"] +CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"] + def convert_open_clap_checkpoint(checkpoint): """ Takes a state dict and returns a converted CLAP checkpoint. @@ -891,7 +893,16 @@ def load_pipeline_from_original_audioldm_ckpt( converted_text_model = convert_open_clap_checkpoint(checkpoint) text_model = ClapTextModelWithProjection(config) - text_model.load_state_dict(converted_text_model) + + missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False) + # we expect not to have token_type_ids in our original state dict so let's ignore them + missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS)) + + if len(unexpected_keys) > 0: + raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}") + + if len(missing_keys) > 0: + raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}") # Convert the vocoder model vocoder_config = create_transformers_vocoder_config(original_config) From 004fed8e7551d7a5dd2ce6c61ff3846b81e7dc1f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 20 Feb 2023 17:09:57 +0100 Subject: [PATCH 09/66] clean-up: pipeline docstring --- .../pipelines/audioldm/pipeline_audioldm.py | 66 ++++++++----------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index efe35aa1aea9..51f29d56a9d6 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -1,4 +1,4 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. 
import inspect -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Callable, List, Optional, Union import torch import torch.nn.functional as F @@ -30,18 +30,18 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# TODO: update doc string for AudioLDM +# TODO: update doc string with checkpoint path EXAMPLE_DOC_STRING = """ Examples: ```py >>> import torch - >>> from diffusers import StableDiffusionPipeline + >>> from diffusers import AudioLDMPipeline - >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = AudioLDMPipeline.from_pretrained("org/audioldm-checkpoint", torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") - >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] + >>> prompt = "A hammer hitting a wooden surface" + >>> audio = pipe(prompt).audio[0] ``` """ @@ -55,23 +55,22 @@ class AudioLDMPipeline(DiffusionPipeline): Args: vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + Variational Auto-Encoder (VAE) Model to encode and decode audios to and from latent representations. text_encoder ([`ClapTextModelWithProjection`]): Frozen text-encoder. AudioLDM uses the text portion of - [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.ClapTextModelWithProjection), - specifically the (TODO) variant. + [CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap#transformers.ClapTextModelWithProjection), + specifically the [RoBERTa HSTAT-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. tokenizer ([`PreTrainedTokenizer`]): Tokenizer of class - [RobertaTokenizer](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/clap#transformers.RobertaTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + [RobertaTokenizer](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaTokenizer). + unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded audio latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. - vocoder (`SpeechT5HifiGan`): + vocoder ([`SpeechT5HifiGan`]): Vocoder of class - [SpeechT5HifiGan](https://huggingface.co/docs/transformers/v4.27.0/en/model_doc/speecht5#transformers.SpeechT5HifiGan). + [SpeechT5HifiGan](https://huggingface.co/docs/transformers/main/en/model_doc/speecht5#transformers.SpeechT5HifiGan). """ - _optional_components = ["safety_checker", "feature_extractor"] def __init__( self, @@ -119,9 +118,7 @@ def __init__( deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" - " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" - " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " following: \n- org/audioldm-checkpoint \n you should change 'sample_size' to 64 in the" " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" @@ -207,16 +204,16 @@ def _encode_prompt( Encodes the prompt into text encoder hidden states. Args: - prompt (`str` or `List[str]`, *optional*): + prompt (`str` or `List[str]`, *optional*): prompt to be encoded - device: (`torch.device`): + device (`torch.device`): torch device num_waveforms_per_prompt (`int`): number of waveforms that should be generated per prompt do_classifier_free_guidance (`bool`): whether to use classifier free guidance or not - negative_ prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). prompt_embeds (`torch.FloatTensor`, *optional*): @@ -452,23 +449,23 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + The prompt or prompts to guide the audio generation. If not defined, one has to pass `prompt_embeds`. instead. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. + The height in pixels of the generated audio. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. + The width in pixels of the generated audio. num_inference_steps (`int`, *optional*, defaults to 200): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the + The number of denoising steps. More denoising steps usually lead to a higher quality audio at the expense of slower inference. guidance_scale (`float`, *optional*, defaults to 2.5): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. + 1`. Higher guidance scale encourages to generate audios that are closely linked to the text `prompt`, + usually at the expense of lower sound quality. negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass + The prompt or prompts not to guide the audio generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). num_waveforms_per_prompt (`int`, *optional*, defaults to 1): @@ -480,7 +477,7 @@ def __call__( One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. prompt_embeds (`torch.FloatTensor`, *optional*): @@ -490,9 +487,6 @@ def __call__( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. @@ -508,9 +502,7 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. + When returning a tuple, the first element is a list with the generated audios. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor From 9feb6ba3ff66eece146f3dabc81e5c117510c4e0 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 20 Feb 2023 17:22:50 +0100 Subject: [PATCH 10/66] tests: from SD --- tests/pipelines/audioldm/__init__.py | 0 tests/pipelines/audioldm/test_audioldm.py | 902 ++++++++++++++++++++++ 2 files changed, 902 insertions(+) create mode 100644 tests/pipelines/audioldm/__init__.py create mode 100644 tests/pipelines/audioldm/test_audioldm.py diff --git a/tests/pipelines/audioldm/__init__.py b/tests/pipelines/audioldm/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py new file mode 100644 index 000000000000..02774d69dc29 --- /dev/null +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -0,0 +1,902 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import gc +import tempfile +import time +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) +from diffusers.utils import load_numpy, nightly, slow, torch_device +from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu + +from ...models.test_models_unet_2d_condition import create_lora_layers +from ...test_pipelines_common import PipelineTesterMixin + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionPipeline + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_lora(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + 
sd_pipe.set_progress_bar_config(disable=None) + + # forward 1 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + # set lora layers + lora_attn_procs = create_lora_layers(sd_pipe.unet) + sd_pipe.unet.set_attn_processor(lora_attn_procs) + sd_pipe = sd_pipe.to(torch_device) + + # forward 2 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) + image = output.images + image_slice_1 = image[0, -3:, -3:, -1] + + # forward 3 + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) + image = output.images + image_slice_2 = image[0, -3:, -3:, -1] + + assert np.abs(image_slice - image_slice_1).max() < 1e-2 + assert np.abs(image_slice - image_slice_2).max() > 1e-2 + + def test_stable_diffusion_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + text_inputs = sd_pipe.tokenizer( + prompt, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] + + inputs["prompt_embeds"] = prompt_embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + def test_stable_diffusion_negative_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + negative_prompt = 3 * ["this is a negative prompt"] + inputs["negative_prompt"] = negative_prompt + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + embeds = [] + for p in [prompt, negative_prompt]: + text_inputs = sd_pipe.tokenizer( + p, + padding="max_length", + max_length=sd_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds + + # forward + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + def test_stable_diffusion_ddim_factor_8(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, height=136, width=136) + image = output.images + + image_slice = 
image[0, -3:, -3:, -1] + + assert image.shape == (1, 136, 136, 3) + expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_pndm(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_no_safety_checker(self): + pipe = StableDiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) + assert isinstance(pipe, StableDiffusionPipeline) + assert isinstance(pipe.scheduler, LMSDiscreteScheduler) + assert pipe.safety_checker is None + + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None + + # check that there's no error when saving a pipeline with one of the models being None + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) + + # sanity check that the pipeline still works + assert pipe.safety_checker is None + image = pipe("example prompt", num_inference_steps=2).images[0] + assert image is not None + + def test_stable_diffusion_k_lms(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.47082293033599854, + 0.5371589064598083, + 0.4562119245529175, + 0.5220914483070374, + 0.5733777284622192, + 0.4795039892196655, + 0.5465868711471558, + 0.5074326395988464, + 0.5042197108268738, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_k_euler_ancestral(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.4707113206386566, + 0.5372191071510315, + 0.4563021957874298, + 0.5220003724098206, + 0.5734264850616455, + 0.4794946610927582, + 0.5463782548904419, + 0.5074145197868347, + 0.504422664642334, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def 
test_stable_diffusion_k_euler(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.47082313895225525, + 0.5371587872505188, + 0.4562119245529175, + 0.5220913887023926, + 0.5733776688575745, + 0.47950395941734314, + 0.546586811542511, + 0.5074326992034912, + 0.5042197108268738, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_vae_slicing(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + image_count = 4 + + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"]] * image_count + output_1 = sd_pipe(**inputs) + + # make sure sliced vae decode yields the same result + sd_pipe.enable_vae_slicing() + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"]] * image_count + output_2 = sd_pipe(**inputs) + + # there is a small discrepancy at image borders vs. full batch decode + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 + + def test_stable_diffusion_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = sd_pipe(**inputs, negative_prompt=negative_prompt) + + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [ + 0.5108221173286438, + 0.5688379406929016, + 0.4685141146183014, + 0.5098261833190918, + 0.5657756328582764, + 0.4631010890007019, + 0.5226285457611084, + 0.49129390716552734, + 0.4899061322212219, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_num_images_per_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + + # test num_images_per_prompt=1 (default) + images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images + + assert images.shape == (1, 64, 64, 3) + + # test num_images_per_prompt=1 (default) for batch of prompts + batch_size = 2 + images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images + + assert images.shape == (batch_size, 64, 64, 3) + + # test num_images_per_prompt for 
single prompt + num_images_per_prompt = 2 + images = sd_pipe( + prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt + ).images + + assert images.shape == (num_images_per_prompt, 64, 64, 3) + + # test num_images_per_prompt for batch of prompts + batch_size = 2 + images = sd_pipe( + [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt + ).images + + assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) + + def test_stable_diffusion_long_prompt(self): + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + do_classifier_free_guidance = True + negative_prompt = None + num_images_per_prompt = 1 + logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + + prompt = 25 * "@" + with CaptureLogger(logger) as cap_logger_3: + text_embeddings_3 = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + prompt = 100 * "@" + with CaptureLogger(logger) as cap_logger: + text_embeddings = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + negative_prompt = "Hello" + with CaptureLogger(logger) as cap_logger_2: + text_embeddings_2 = sd_pipe._encode_prompt( + prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape + assert text_embeddings.shape[1] == 77 + + assert cap_logger.out == cap_logger_2.out + # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 + assert cap_logger.out.count("@") == 25 + assert cap_logger_3.out == "" + + def test_stable_diffusion_height_width_opt(self): + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "hey" + + output = sd_pipe(prompt, num_inference_steps=1, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (64, 64) + + output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (96, 96) + + config = dict(sd_pipe.unet.config) + config["sample_size"] = 96 + sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) + output = sd_pipe(prompt, num_inference_steps=1, output_type="np") + image_shape = output.images[0].shape[:2] + assert image_shape == (192, 192) + + +@slow +@require_torch_gpu +class StableDiffusionPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + 
"output_type": "numpy", + } + return inputs + + def test_stable_diffusion_1_1_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_1_4_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_ddim(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_lms(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_dpm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) + assert np.abs(image_slice - expected_slice).max() < 1e-4 + + def test_stable_diffusion_attention_slicing(self): + torch.cuda.reset_peak_memory_stats() + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + # enable attention slicing + pipe.enable_attention_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image_sliced = 
pipe(**inputs).images + + mem_bytes = torch.cuda.max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + # make sure that less than 3.75 GB is allocated + assert mem_bytes < 3.75 * 10**9 + + # disable slicing + pipe.disable_attention_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image = pipe(**inputs).images + + # make sure that more than 3.75 GB is allocated + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes > 3.75 * 10**9 + assert np.abs(image_sliced - image).max() < 1e-3 + + def test_stable_diffusion_vae_slicing(self): + torch.cuda.reset_peak_memory_stats() + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + # enable vae slicing + pipe.enable_vae_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs["prompt"] = [inputs["prompt"]] * 4 + inputs["latents"] = torch.cat([inputs["latents"]] * 4) + image_sliced = pipe(**inputs).images + + mem_bytes = torch.cuda.max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + # make sure that less than 4 GB is allocated + assert mem_bytes < 4e9 + + # disable vae slicing + pipe.disable_vae_slicing() + inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs["prompt"] = [inputs["prompt"]] * 4 + inputs["latents"] = torch.cat([inputs["latents"]] * 4) + image = pipe(**inputs).images + + # make sure that more than 4 GB is allocated + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes > 4e9 + # There is a small discrepancy at the image borders vs. a fully batched version. + assert np.abs(image_sliced - image).max() < 1e-2 + + def test_stable_diffusion_fp16_vs_autocast(self): + # this test makes sure that the original model with autocast + # and the new model with fp16 yield the same result + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + image_fp16 = pipe(**inputs).images + + with torch.autocast(torch_device): + inputs = self.get_inputs(torch_device) + image_autocast = pipe(**inputs).images + + # Make sure results are close enough + diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) + # They ARE different since ops are not run always at the same precision + # however, they should be extremely close. 
+ assert diff.mean() < 2e-2 + + def test_stable_diffusion_intermediate_state(self): + number_of_steps = 0 + + def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: + callback_fn.has_been_called = True + nonlocal number_of_steps + number_of_steps += 1 + if step == 1: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array( + [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] + ) + + assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + elif step == 2: + latents = latents.detach().cpu().numpy() + assert latents.shape == (1, 4, 64, 64) + latents_slice = latents[0, -3:, -3:, -1] + expected_slice = np.array( + [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] + ) + + assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + + callback_fn.has_been_called = False + + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + pipe(**inputs, callback=callback_fn, callback_steps=1) + assert callback_fn.has_been_called + assert number_of_steps == inputs["num_inference_steps"] + + def test_stable_diffusion_low_cpu_mem_usage(self): + pipeline_id = "CompVis/stable-diffusion-v1-4" + + start_time = time.time() + pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) + pipeline_low_cpu_mem_usage.to(torch_device) + low_cpu_mem_usage_time = time.time() - start_time + + start_time = time.time() + _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) + normal_load_time = time.time() - start_time + + assert 2 * low_cpu_mem_usage_time < normal_load_time + + def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing(1) + pipe.enable_sequential_cpu_offload() + + inputs = self.get_inputs(torch_device, dtype=torch.float16) + _ = pipe(**inputs) + + mem_bytes = torch.cuda.max_memory_allocated() + # make sure that less than 2.8 GB is allocated + assert mem_bytes < 2.8 * 10**9 + + +@nightly +@require_torch_gpu +class StableDiffusionPipelineNightlyTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 50, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_1_4_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + 
sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_1_5_pndm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_ddim(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_lms(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_euler(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 + + def test_stable_diffusion_dpm(self): + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 25 + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" + "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 1e-3 From bf3964c5652dfd7ae4147b0ed55b97f723ba641c Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 20 Feb 2023 18:43:05 +0100 Subject: [PATCH 11/66] clean-up: cpu offload vocoder instead of safety checker --- 
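Note for reviewers: since the audio pipeline has no safety checker, the vocoder takes its place in the sequential offload loop, so every sub-model that runs during inference is moved to the GPU only for its own forward pass. A rough sketch of the resulting loop, assuming an `accelerate` install and a pipeline exposing `unet`, `text_encoder`, `vae` and `vocoder` attributes:

```py
import torch
from accelerate import cpu_offload


def offload_pipeline_components(pipe, gpu_id=0):
    # Keep every sub-model on CPU and stream it to the GPU only when it is called.
    device = torch.device(f"cuda:{gpu_id}")
    for sub_model in [pipe.unet, pipe.text_encoder, pipe.vae, pipe.vocoder]:
        cpu_offload(sub_model, device)
```

The vocoder is small relative to the UNet, but offloading it as well keeps peak GPU memory consistent with the other components.
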
src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 51f29d56a9d6..073978f42267 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -159,7 +159,7 @@ def disable_vae_slicing(self): def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + text_encoder, vae and vocoder have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. """ if is_accelerate_available(): @@ -169,7 +169,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.vocoder]: cpu_offload(cpu_offloaded_model, device) @property From f200e8030f03be35c0bc7e1fd5f17ce8568b48c5 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 13:58:27 +0100 Subject: [PATCH 12/66] feat: adapt tests to audioldm --- .../pipelines/audioldm/pipeline_audioldm.py | 15 +- tests/pipelines/audioldm/test_audioldm.py | 854 +++++++++--------- 2 files changed, 420 insertions(+), 449 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 073978f42267..85e6c6ac072a 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -335,12 +335,9 @@ def mel_spectrogram_to_waveform(self, mel_spectrogram): if mel_spectrogram.dim() == 4: mel_spectrogram = mel_spectrogram.squeeze(1) - mel_spectrogram = mel_spectrogram.squeeze(1) - # TODO: our transformers implementation can't handle batching! Squeeze tensor from [bsz, seq_len, mel_bins] to [seq_len, mel_bins] - mel_spectrogram = mel_spectrogram.squeeze(0) waveform = self.vocoder(mel_spectrogram) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - waveform = waveform.cpu().detach().numpy() + waveform = waveform.cpu().detach() return waveform def prepare_extra_step_kwargs(self, generator, eta): @@ -443,6 +440,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + output_type: Optional[str] = "np", ): r""" Function invoked when calling the pipeline for generation. @@ -496,6 +494,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generate image. Choose between: + - `"np"`: Return Numpy `np.ndarray` objects. + - `"pt"`: Return PyTorch `torch.Tensor` objects. Examples: @@ -506,7 +508,7 @@ def __call__( """ # 0. 
Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor // 8 # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -592,6 +594,9 @@ def __call__( audio = self.mel_spectrogram_to_waveform(mel_spectrogram) + if output_type == "np": + audio = audio.numpy() + if not return_dict: return (audio,) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 02774d69dc29..099a5903e044 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 HuggingFace Inc. +# Copyright 2023 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,15 +15,16 @@ import gc -import tempfile import time import unittest import numpy as np import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer +import torch.nn.functional as F +from transformers import ClapTextConfig, ClapTextModelWithProjection, RobertaTokenizer, SpeechT5HifiGan, SpeechT5HifiGanConfig from diffusers import ( + AudioLDMPipeline, AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, @@ -31,26 +32,24 @@ EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, + UNet2DModel, logging, ) from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu -from ...models.test_models_unet_2d_condition import create_lora_layers from ...test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline +class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = AudioLDMPipeline def get_dummy_components(self): torch.manual_seed(0) - unet = UNet2DConditionModel( + unet = UNet2DModel( block_out_channels=(32, 64), layers_per_block=2, sample_size=32, @@ -59,6 +58,8 @@ def get_dummy_components(self): down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, + extra_film_condition_dim=32, + extra_film_use_concat=True, ) scheduler = DDIMScheduler( beta_start=0.00085, @@ -70,14 +71,14 @@ def get_dummy_components(self): torch.manual_seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], - in_channels=3, - out_channels=3, + in_channels=1, + out_channels=1, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, ) torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( + text_encoder_config = ClapTextConfig( bos_token_id=0, eos_token_id=2, hidden_size=32, @@ -87,9 +88,23 @@ def get_dummy_components(self): num_hidden_layers=5, pad_token_id=1, vocab_size=1000, + projection_dim=32, ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + text_encoder = ClapTextModelWithProjection(text_encoder_config) + tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) + + vocoder_config = SpeechT5HifiGanConfig( 
+ model_in_dim=8, + sampling_rate=16000, + upsample_initial_channel=16, + upsample_rates=[2, 2], + upsample_kernel_sizes=[4, 4], + resblock_kernel_sizes=[3, 7], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], + normalize_before=False, + ) + + vocoder = SpeechT5HifiGan(vocoder_config) components = { "unet": unet, @@ -97,8 +112,7 @@ def get_dummy_components(self): "vae": vae, "text_encoder": text_encoder, "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, + "vocoder": vocoder, } return components @@ -108,109 +122,80 @@ def get_dummy_inputs(self, device, seed=0): else: generator = torch.Generator(device=device).manual_seed(seed) inputs = { - "prompt": "A painting of a squirrel eating a burger", + "prompt": "A hammer hitting a wooden surface", "generator": generator, "num_inference_steps": 2, "guidance_scale": 6.0, - "output_type": "numpy", } return inputs - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_lora(self): + def test_audioldm_ddim(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) - # forward 1 inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio = output.audios[0] - # set lora layers - lora_attn_procs = create_lora_layers(sd_pipe.unet) - sd_pipe.unet.set_attn_processor(lora_attn_procs) - sd_pipe = sd_pipe.to(torch_device) + assert audio.ndim == 1 + assert len(audio) == 256 - # forward 2 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) - image = output.images - image_slice_1 = image[0, -3:, -3:, -1] - - # forward 3 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) - image = output.images - image_slice_2 = image[0, -3:, -3:, -1] + audio_slice = audio[:10] + expected_slice = np.array([-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]) - assert np.abs(image_slice - image_slice_1).max() < 1e-2 - assert np.abs(image_slice - image_slice_2).max() > 1e-2 + assert np.abs(audio_slice - expected_slice).max() < 1e-3 - def test_stable_diffusion_prompt_embeds(self): + def test_audioldm_prompt_embeds(self): components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + 
audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(torch_device) inputs["prompt"] = 3 * [inputs["prompt"]] # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio_1 = output.audios[0] inputs = self.get_dummy_inputs(torch_device) prompt = 3 * [inputs.pop("prompt")] - text_inputs = sd_pipe.tokenizer( + text_inputs = audioldm_pipe.tokenizer( prompt, padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, + max_length=audioldm_pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) text_inputs = text_inputs["input_ids"].to(torch_device) - prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] + prompt_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) + prompt_embeds = prompt_embeds.text_embeds + # additional L_2 normalization over each hidden-state + prompt_embeds = F.normalize(prompt_embeds, dim=-1) inputs["prompt_embeds"] = prompt_embeds # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio_2 = output.audios[0] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + assert np.abs(audio_1 - audio_2).max() < 1e-4 - def test_stable_diffusion_negative_prompt_embeds(self): + def test_audioldm_negative_prompt_embeds(self): components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(torch_device) negative_prompt = 3 * ["this is a negative prompt"] @@ -218,342 +203,374 @@ def test_stable_diffusion_negative_prompt_embeds(self): inputs["prompt"] = 3 * [inputs["prompt"]] # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio_1 = output.audios[0] inputs = self.get_dummy_inputs(torch_device) prompt = 3 * [inputs.pop("prompt")] embeds = [] for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( + text_inputs = audioldm_pipe.tokenizer( p, padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, + max_length=audioldm_pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) text_inputs = text_inputs["input_ids"].to(torch_device) - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) + text_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) + text_embeds = text_embeds.text_embeds + # additional L_2 normalization over each hidden-state + text_embeds = F.normalize(text_embeds, dim=-1) + + embeds.append(text_embeds) inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio_2 = output.audios[0] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + assert np.abs(audio_1 - audio_2).max() < 1e-4 - def test_stable_diffusion_ddim_factor_8(self): + def test_audioldm_ddim_factor_8(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = 
sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, height=136, width=136) - image = output.images + output = audioldm_pipe(**inputs, height=136) # width has to stay fixed for the vocoder + audio = output.audios[0] - image_slice = image[0, -3:, -3:, -1] + assert audio.ndim == 1 + assert len(audio) == 544 - assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) + audio_slice = audio[-10:] + expected_slice = np.array([-0.0029, 0.0036, -0.0027, 0.0032, -0.0029, 0.0034, -0.0028, 0.0073, 0.0039, 0.0058]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(audio_slice - expected_slice).max() < 1e-3 - def test_stable_diffusion_pndm(self): + def test_audioldm_pndm(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] + output = audioldm_pipe(**inputs) + audio = output.audios[0] - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) + assert audio.ndim == 1 + assert len(audio) == 256 - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + audio_slice = audio[:10] + expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) + assert np.abs(audio_slice - expected_slice).max() < 1e-3 - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - def test_stable_diffusion_k_lms(self): + def test_audioldm_k_lms(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe.scheduler = 
LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082293033599854, - 0.5371589064598083, - 0.4562119245529175, - 0.5220914483070374, - 0.5733777284622192, - 0.4795039892196655, - 0.5465868711471558, - 0.5074326395988464, - 0.5042197108268738, - ] - ) + output = audioldm_pipe(**inputs) + audio = output.audios[0] - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert audio.ndim == 1 + assert len(audio) == 256 - def test_stable_diffusion_k_euler_ancestral(self): + audio_slice = audio[:10] + expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + + assert np.abs(audio_slice - expected_slice).max() < 1e-3 + + def test_audioldm_k_euler_ancestral(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.4707113206386566, - 0.5372191071510315, - 0.4563021957874298, - 0.5220003724098206, - 0.5734264850616455, - 0.4794946610927582, - 0.5463782548904419, - 0.5074145197868347, - 0.504422664642334, - ] - ) + output = audioldm_pipe(**inputs) + audio = output.audios[0] - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert audio.ndim == 1 + assert len(audio) == 256 - def test_stable_diffusion_k_euler(self): + audio_slice = audio[:10] + expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + + assert np.abs(audio_slice - expected_slice).max() < 1e-3 + + def test_audioldm_k_euler(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe.scheduler = EulerDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082313895225525, - 0.5371587872505188, - 0.4562119245529175, - 0.5220913887023926, - 0.5733776688575745, - 0.47950395941734314, - 0.546586811542511, - 0.5074326992034912, - 0.5042197108268738, - ] - ) + output = audioldm_pipe(**inputs) + audio = 
output.audios[0] - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert audio.ndim == 1 + assert len(audio) == 256 - def test_stable_diffusion_vae_slicing(self): + audio_slice = audio[:10] + expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + + assert np.abs(audio_slice - expected_slice).max() < 1e-3 + + def test_audioldm_vae_slicing(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) image_count = 4 inputs = self.get_dummy_inputs(device) inputs["prompt"] = [inputs["prompt"]] * image_count - output_1 = sd_pipe(**inputs) + output_1 = audioldm_pipe(**inputs) # make sure sliced vae decode yields the same result - sd_pipe.enable_vae_slicing() + audioldm_pipe.enable_vae_slicing() inputs = self.get_dummy_inputs(device) inputs["prompt"] = [inputs["prompt"]] * image_count - output_2 = sd_pipe(**inputs) + output_2 = audioldm_pipe(**inputs) - # there is a small discrepancy at image borders vs. full batch decode - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 + # there is a small discrepancy at spectrogram borders vs. full batch decode + assert np.abs(output_2.audios - output_1.audios).max() < 1e-4 - def test_stable_diffusion_negative_prompt(self): + def test_audioldm_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.5108221173286438, - 0.5688379406929016, - 0.4685141146183014, - 0.5098261833190918, - 0.5657756328582764, - 0.4631010890007019, - 0.5226285457611084, - 0.49129390716552734, - 0.4899061322212219, - ] - ) + negative_prompt = "egg cracking" + output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) + audio = output.audios[0] + + assert audio.ndim == 1 + assert len(audio) == 256 + + audio_slice = audio[:10] + expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, + 0.0033, -0.0028, 0.0032]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(audio_slice - expected_slice).max() < 1e-3 - def test_stable_diffusion_num_images_per_prompt(self): + def test_audioldm_num_waveforms_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - 
sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(device) + audioldm_pipe.set_progress_bar_config(disable=None) - prompt = "A painting of a squirrel eating a burger" + prompt = "A hammer hitting a wooden surface" - # test num_images_per_prompt=1 (default) - images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images + # test num_waveforms_per_prompt=1 (default) + audios = audioldm_pipe(prompt, num_inference_steps=2).audios - assert images.shape == (1, 64, 64, 3) + assert audios.shape == (1, 256) - # test num_images_per_prompt=1 (default) for batch of prompts + # test num_waveforms_per_prompt=1 (default) for batch of prompts batch_size = 2 - images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images + audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - assert images.shape == (batch_size, 64, 64, 3) + assert audios.shape == (batch_size, 256) - # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images + # test num_waveforms_per_prompt for single prompt + num_waveforms_per_prompt = 2 + audios = audioldm_pipe( + prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt + ).audios - assert images.shape == (num_images_per_prompt, 64, 64, 3) + assert audios.shape == (num_waveforms_per_prompt, 256) - # test num_images_per_prompt for batch of prompts + # test num_waveforms_per_prompt for batch of prompts batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt - ).images + audios = audioldm_pipe( + [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt + ).audios - assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) + assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - def test_stable_diffusion_long_prompt(self): + def test_audioldm_long_prompt(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("diffusers.pipelines.audioldm.pipeline_audioldm") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( + text_embeddings_3 = audioldm_pipe._encode_prompt( prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) prompt = 100 * "@" with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( + text_embeddings = audioldm_pipe._encode_prompt( prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt ) negative_prompt = "Hello" with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( + text_embeddings_2 = audioldm_pipe._encode_prompt( prompt, torch_device, num_images_per_prompt, 
do_classifier_free_guidance, negative_prompt ) assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 + + assert text_embeddings.shape[1] == 32 assert cap_logger.out == cap_logger_2.out # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 assert cap_logger.out.count("@") == 25 assert cap_logger_3.out == "" - def test_stable_diffusion_height_width_opt(self): + def test_audioldm_height_opt(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) - prompt = "hey" + prompt = ["hey"] - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (64, 64) + output = audioldm_pipe(prompt, num_inference_steps=1) + audio_shape = output.audios.shape + assert audio_shape == (1, 256) - output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (96, 96) + output = audioldm_pipe(prompt, num_inference_steps=1, height=96, width=8) + audio_shape = output.audios.shape + assert audio_shape == (1, 384) - config = dict(sd_pipe.unet.config) + config = dict(audioldm_pipe.unet.config) config["sample_size"] = 96 - sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (192, 192) + audioldm_pipe.unet = UNet2DModel.from_config(config).to(torch_device) + output = audioldm_pipe(prompt, num_inference_steps=1, width=8) # need to keep width fixed for vocoder + audio_shape = output.audios.shape + assert audio_shape == (1, 768) + + def test_audioldm_width_opt(self): + components = self.get_dummy_components() + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) + + prompt = ["hey"] + + width = audioldm_pipe.vocoder.config.model_in_dim + + output = audioldm_pipe(prompt, num_inference_steps=1, width=width) + audio_shape = output.audios.shape + assert audio_shape == (1, 256) + + config = audioldm_pipe.vocoder.config + config.model_in_dim = width * 2 + audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) + output = audioldm_pipe(prompt, num_inference_steps=1, width=width*2) + audio_shape = output.audios.shape + # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram + assert audio_shape == (1, 256) + + def test_attention_slicing_forward_pass(self): + # override this test since we want to compare 1-d audio waveforms (not 3d pixel arrays) + if not self.test_attention_slicing: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + # Warmup pass when using mps (see #372) + if torch_device == "mps": + _ = pipe(**self.get_dummy_inputs(torch_device)) + + inputs = self.get_dummy_inputs(torch_device) + output_without_slicing = pipe(**inputs).audios + 
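+        # re-run with attention computed one slice at a time (slice_size=1); the generated
+        # waveforms should match the un-sliced baseline above within the tolerance asserted below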
+ pipe.enable_attention_slicing(slice_size=1) + inputs = self.get_dummy_inputs(torch_device) + output_with_slicing = pipe(**inputs).audios + + max_diff = np.abs(output_with_slicing - output_without_slicing).max() + self.assertLess(max_diff, 1e-3, "Attention slicing should not affect the inference results") + + def test_inference_batch_single_identical(self): + # override this test since we want to compare 1-d audio waveforms (not 3d pixel arrays) + components = self.get_dummy_components() + audioldm_pipe = AudioLDMPipeline(**components) + audioldm_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + + # run single-sample inference + inputs["generator"] = self.get_generator(0) + output = audioldm_pipe(**inputs).audios + + batch_size = 3 + batched_inputs = {} + + # make unequal batch sizes + batched_inputs["prompt"] = [inputs["prompt"][: len(inputs["prompt"]) // i] for i in range(1, batch_size + 1)] + # make last batch super long + batched_inputs["prompt"][-1] = 2000 * "very long" + # set the generator + batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] + # duplicate any remaining inputs + for named_input in inputs: + if named_input not in batched_inputs: + batched_inputs[named_input] = inputs[named_input] + + # run batched inference + output_batch = audioldm_pipe(**batched_inputs).audios + assert output_batch.shape[0] == batch_size + + max_diff = np.abs(output_batch[0] - output[0]).max() + assert max_diff < 1e-4 @slow @require_torch_gpu -class StableDiffusionPipelineSlowTests(unittest.TestCase): +class AudioLDMPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() @@ -561,96 +578,79 @@ def tearDown(self): def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) latents = torch.from_numpy(latents).to(device=device, dtype=dtype) inputs = { - "prompt": "a photograph of an astronaut riding a horse", + "prompt": "A hammer hitting a wooden surface", "latents": latents, "generator": generator, "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", + "guidance_scale": 2.5, } return inputs - def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_ddim(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe.scheduler = DDIMScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 + audio = audioldm_pipe(**inputs).audios[0] + + assert audio.ndim == 1 + assert len(audio) == 81952 + + audio_slice = audio[3880:3890] + expected_slice 
= np.array([-0.0574, 0.2462, 0.3955, 0.4213, 0.3901, 0.3770, 0.2762, 0.0206, -0.2208, -0.3282]) + max_diff = np.abs(expected_slice - audio_slice).max() + assert max_diff < 1e-3 - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_lms(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + audio = audioldm_pipe(**inputs).audios[0] - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() + assert audio.ndim == 1 + assert len(audio) == 81952 - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 + audio_slice = audio[27780:27790] + expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) + max_diff = np.abs(expected_slice - audio_slice).max() + assert max_diff < 1e-3 - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_dpm(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe = audioldm_pipe.to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 + audio = audioldm_pipe(**inputs).audios[0] - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) + assert audio.ndim == 1 + assert len(audio) == 81952 - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape 
== (1, 512, 512, 3) - expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 + audio_slice = audio[69310:69320] + expected_slice = np.array([ 0.1842, 0.2411, 0.3127, 0.3069, 0.2287, 0.0948, -0.0071, -0.041 , -0.1293, -0.2075]) + max_diff = np.abs(expected_slice - audio_slice).max() + assert max_diff < 1e-3 - def test_stable_diffusion_attention_slicing(self): + def test_audioldm_attention_slicing(self): + # TODO(SG): fix or remove. This test yields the same memory for with / without attn slicing torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) # enable attention slicing - pipe.enable_attention_slicing() + pipe.enable_attention_slicing(slice_size="max") inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_sliced = pipe(**inputs).images + audios_sliced = pipe(**inputs).audios mem_bytes = torch.cuda.max_memory_allocated() torch.cuda.reset_peak_memory_stats() @@ -660,16 +660,16 @@ def test_stable_diffusion_attention_slicing(self): # disable slicing pipe.disable_attention_slicing() inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images + audios = pipe(**inputs).audios # make sure that more than 3.75 GB is allocated mem_bytes = torch.cuda.max_memory_allocated() assert mem_bytes > 3.75 * 10**9 - assert np.abs(image_sliced - image).max() < 1e-3 + assert np.abs(audios_sliced - audios).max() < 1e-3 - def test_stable_diffusion_vae_slicing(self): + def test_audioldm_vae_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -679,47 +679,47 @@ def test_stable_diffusion_vae_slicing(self): inputs = self.get_inputs(torch_device, dtype=torch.float16) inputs["prompt"] = [inputs["prompt"]] * 4 inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image_sliced = pipe(**inputs).images + audio_sliced = pipe(**inputs).audios mem_bytes = torch.cuda.max_memory_allocated() torch.cuda.reset_peak_memory_stats() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4e9 + # make sure that less than 1.1 GB is allocated + assert mem_bytes < 1.1 * 10**9 # disable vae slicing pipe.disable_vae_slicing() inputs = self.get_inputs(torch_device, dtype=torch.float16) inputs["prompt"] = [inputs["prompt"]] * 4 inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image = pipe(**inputs).images + audio = pipe(**inputs).audios - # make sure that more than 4 GB is allocated + # make sure that more than 1.1 GB is allocated mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 4e9 - # There is a small discrepancy at the image borders vs. a fully batched version. - assert np.abs(image_sliced - image).max() < 1e-2 + assert mem_bytes > 1.1 * 10**9 + # There is a small discrepancy at the spectrogram borders vs. a fully batched version. 
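+        # (with VAE slicing enabled the latents are decoded one spectrogram at a time)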
+ assert np.abs(audio_sliced - audio).max() < 1e-2 - def test_stable_diffusion_fp16_vs_autocast(self): + def test_audioldm_fp16_vs_autocast(self): # this test makes sure that the original model with autocast # and the new model with fp16 yield the same result - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_fp16 = pipe(**inputs).images + audio_fp16 = pipe(**inputs).audios with torch.autocast(torch_device): inputs = self.get_inputs(torch_device) - image_autocast = pipe(**inputs).images + audio_autocast = pipe(**inputs).audios # Make sure results are close enough - diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) + diff = np.abs(audio_fp16.flatten() - audio_autocast.flatten()) # They ARE different since ops are not run always at the same precision # however, they should be extremely close. - assert diff.mean() < 2e-2 + assert diff.mean() < 1e-3 - def test_stable_diffusion_intermediate_state(self): + def test_audioldm_intermediate_state(self): number_of_steps = 0 def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: @@ -728,26 +728,26 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: number_of_steps += 1 if step == 1: latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) + assert latents.shape == (1, 8, 128, 16) latents_slice = latents[0, -3:, -3:, -1] expected_slice = np.array( - [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] + [-0.6730, -0.9062, 1.0400, 0.4220, -0.9785, 1.817, 0.1906, -1.3430, 1.3330] ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 elif step == 2: latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) + assert latents.shape == (1, 8, 128, 16) latents_slice = latents[0, -3:, -3:, -1] expected_slice = np.array( - [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] + [-0.6763, -0.9062, 1.0520, 0.4200, -0.9750, 1.8220, 0.1879, -1.3490, 1.3190] ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 + assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -757,26 +757,26 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: assert callback_fn.has_been_called assert number_of_steps == inputs["num_inference_steps"] - def test_stable_diffusion_low_cpu_mem_usage(self): - pipeline_id = "CompVis/stable-diffusion-v1-4" + def test_audioldm_low_cpu_mem_usage(self): + pipeline_id = "sanchit-gandhi/audioldm-text-to-audio" start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) + pipeline_low_cpu_mem_usage = AudioLDMPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) 
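+        # the default loading path keeps CPU memory usage low; its wall-clock time is
+        # compared against an explicit `low_cpu_mem_usage=False` load further below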
pipeline_low_cpu_mem_usage.to(torch_device) low_cpu_mem_usage_time = time.time() - start_time start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) + _ = AudioLDMPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) normal_load_time = time.time() - start_time assert 2 * low_cpu_mem_usage_time < normal_load_time - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): + def test_audioldm_pipeline_with_sequential_cpu_offloading(self): torch.cuda.empty_cache() torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) @@ -792,7 +792,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): @nightly @require_torch_gpu -class StableDiffusionPipelineNightlyTests(unittest.TestCase): +class AudioLDMPipelineNightlyTests(unittest.TestCase): def tearDown(self): super().tearDown() gc.collect() @@ -800,103 +800,69 @@ def tearDown(self): def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) latents = torch.from_numpy(latents).to(device=device, dtype=dtype) inputs = { - "prompt": "a photograph of an astronaut riding a horse", + "prompt": "A hammer hitting a wooden table", "latents": latents, "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", + "num_inference_steps": 5, + "guidance_scale": 2.5, } return inputs - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_ddim(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - 
image = sd_pipe(**inputs).images[0] + audios = audioldm_pipe(**inputs).audios[0] - expected_image = load_numpy( + expected_audios = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" ) - max_diff = np.abs(expected_image - image).max() + max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_lms(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] + audios = audioldm_pipe(**inputs).audios[0] - expected_image = load_numpy( + expected_audios = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" ) - max_diff = np.abs(expected_image - image).max() + max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_euler(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe.scheduler = EulerDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] + audios = audioldm_pipe(**inputs).audios[0] - expected_image = load_numpy( + expected_audios = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" ) - max_diff = np.abs(expected_image - image).max() + max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) + def test_audioldm_dpm(self): + audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) + audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] + audios = audioldm_pipe(**inputs).audios[0] - expected_image = load_numpy( + expected_audios = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" ) - max_diff = np.abs(expected_image - image).max() + max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 From dd04c2e0e5435906530124bc6e50e3fac9bf1ff6 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi 
Date: Tue, 21 Feb 2023 15:25:04 +0100 Subject: [PATCH 13/66] feat: add docs --- docs/source/en/api/pipelines/audioldm.mdx | 78 +++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 docs/source/en/api/pipelines/audioldm.mdx diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx new file mode 100644 index 000000000000..f3dae71627fa --- /dev/null +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -0,0 +1,78 @@ + + +# AudioLDM + +## Overview + +AudioLDM was proposed in [AudioLDM: Text-to-Audio Generation with Latent Diffusion Models](https://arxiv.org/abs/2301.12503) by Haohe Liu et al. + +Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview), AudioLDM +is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from [CLAP](https://huggingface.co/docs/transformers/main/model_doc/clap) +latents. AudioLDM takes a text prompt as input and predicts the corresponding audio. It can generate text-conditional +sound effects, human speech and music. + +This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi). The original codebase can be found [here](https://github.com/haoheliu/AudioLDM). + +## Text-to-Audio + +- *Text-to-Audio* [sanchit-gandhi/audioldm-text-to-audio](https://huggingface.co/sanchit-gandhi/audioldm-text-to-audio) with [`AudioLDMPipeline`] + +```python +from diffusers import AudioLDMPipeline +import torch + +repo_id = "sanchit-gandhi/audioldm-text-to-audio" +pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) +pipe = pipe.to("cuda") + +prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" +audio = pipe(prompt, num_inference_steps=10, height=512).audios[0] + +# save the audio sample as a .wav file +import scipy +scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) +``` + +### Tips + +* Try to provide descriptive text inputs to AudioLDM. You can use adjectives to describe the sound (e.g. "high quality" or "clear") and make the prompt context specific (e.g., "water stream in a forest" instead of "stream"). +* It's best to use general terms like 'cat' or 'dog' instead of specific names or abstract objects that the model may not be familiar with. +* The _quality_ of the predicted audio sample can be controlled by the `num_inference_steps` argument: higher steps give higher quality audio at the expense of slower inference. +* The _length_ of the predicted audio sample can be controlled by varying the `height` argument (which controls the height of the spectrogram prediction). + +### How to load and use different schedulers + +The AudioLDM pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers +that can be used with the AudioLDM pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], +[`EulerAncestralDiscreteScheduler`] etc. We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest +scheduler there is. + +To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] +method, or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. 
For example, to use the +[`DPMSolverMultistepScheduler`], you can do the following: + +```python +>>> from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler + +>>> pipeline = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") +>>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + +>>> # or +>>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", subfolder="scheduler") +>>> pipeline = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", scheduler=dpm_scheduler) +``` + +## AudioLDMPipeline +[[autodoc]] AudioLDMPipeline + - all + - __call__ From 1c26ca9492107da9abee0b0ac192e5bcc5438c59 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:25:22 +0100 Subject: [PATCH 14/66] clean-up: amend pipeline docstrings --- src/diffusers/pipelines/audioldm/convert_from_ckpt.py | 5 ++--- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index dd58f5c3dac5..36acbd30c4be 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -240,7 +240,7 @@ def create_unet_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=True, # TODO: hacky - what are we doing re cross attention? + cross_attention_dim=True, # TODO(SG): hacky - what are we doing re cross attention? extra_film_condition_dim=extra_film_condition_dim, extra_film_use_concat=extra_film_use_concat, ) @@ -686,8 +686,7 @@ def convert_hifigan_checkpoint(checkpoint, config): vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias") if not config.normalize_before: - # if we don't normalize before these variables are unused, so we set them to arbitrary values - # TODO: fix this in transformers + # if we don't set normalize_before then these variables are unused, so we set them to their initialised values vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim) vocoder_state_dict["scale"] = torch.ones(config.model_in_dim) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 85e6c6ac072a..d71335692f68 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -450,9 +450,10 @@ def __call__( The prompt or prompts to guide the audio generation. If not defined, one has to pass `prompt_embeds`. instead. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated audio. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated audio. + The height in pixels of the generated spectrogram. Using a larger height results in a longer spectrogram + and thus longer audio sample. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor // 8): + The width in pixels of the generated spectrogram. num_inference_steps (`int`, *optional*, defaults to 200): The number of denoising steps. 
More denoising steps usually lead to a higher quality audio at the expense of slower inference. From d32bd7f4265128b3504b873a921d9f27403a282d Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:28:11 +0100 Subject: [PATCH 15/66] clean-up: make style --- docs/source/en/api/pipelines/audioldm.mdx | 5 +- .../convert_original_audioldm_to_diffusers.py | 8 +- src/diffusers/models/unet_2d.py | 3 +- .../pipelines/audioldm/convert_from_ckpt.py | 163 +++++++++--------- .../pipelines/audioldm/pipeline_audioldm.py | 14 +- tests/pipelines/audioldm/test_audioldm.py | 89 +++++----- 6 files changed, 144 insertions(+), 138 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index f3dae71627fa..9c8aed78faa2 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -40,6 +40,7 @@ audio = pipe(prompt, num_inference_steps=10, height=512).audios[0] # save the audio sample as a .wav file import scipy + scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) ``` @@ -68,7 +69,9 @@ method, or pass the `scheduler` argument to the `from_pretrained` method of the >>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) >>> # or ->>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", subfolder="scheduler") +>>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained( +... "sanchit-gandhi/audioldm-text-to-audio", subfolder="scheduler" +... ) >>> pipeline = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", scheduler=dpm_scheduler) ``` diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 9d0684cb27cc..279e7986b4a6 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -58,17 +58,13 @@ "--image_size", default=None, type=int, - help=( - "The image size that the model was trained on." - ), + help=("The image size that the model was trained on."), ) parser.add_argument( "--prediction_type", default=None, type=str, - help=( - "The prediction type that the model was trained on." 
- ), + help=("The prediction type that the model was trained on."), ) parser.add_argument( "--extract_ema", diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index f8ec535964d6..112710d869f4 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -21,7 +21,7 @@ from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block, UNetMidBlock2DCrossAttn +from .unet_2d_blocks import UNetMidBlock2D, UNetMidBlock2DCrossAttn, get_down_block, get_up_block @dataclass @@ -201,7 +201,6 @@ def __init__( add_attention=add_attention, ) - # up reversed_block_out_channels = list(reversed(block_out_channels)) output_channel = reversed_block_out_channels[0] diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 36acbd30c4be..ce21640e1405 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -17,6 +17,13 @@ import re import torch +from transformers import ( + AutoTokenizer, + ClapTextConfig, + ClapTextModelWithProjection, + SpeechT5HifiGan, + SpeechT5HifiGanConfig, +) from diffusers import ( AudioLDMPipeline, @@ -30,13 +37,6 @@ PNDMScheduler, UNet2DModel, ) -from transformers import ( - AutoTokenizer, - ClapTextConfig, - ClapTextModelWithProjection, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) from ...utils import is_omegaconf_available, is_safetensors_available from ...utils.import_utils import BACKENDS_MAPPING @@ -251,9 +251,8 @@ def create_unet_diffusers_config(original_config, image_size: int): # Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.create_vae_diffusers_config def create_vae_diffusers_config(original_config, checkpoint, image_size: int): """ - Creates a VAE config for diffusers based on the config of the original AudioLDM model. - Compared to the original Stable Diffusion conversion, this function passes a - *learnt* VAE scaling factor to the diffusers VAE. + Creates a VAE config for diffusers based on the config of the original AudioLDM model. Compared to the original + Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. """ vae_params = original_config.model.params.first_stage_config.params.ddconfig _ = original_config.model.params.first_stage_config.params.embed_dim @@ -292,9 +291,8 @@ def create_diffusers_schedular(original_config): # Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_ldm_unet_checkpoint def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ - Takes a state dict and a config, and returns a converted checkpoint. Compared to the - original Stable Diffusion conversion, this function additionally converts the learnt - film embedding linear layer. + Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion + conversion, this function additionally converts the learnt film embedding linear layer. """ # extract state_dict for UNet @@ -592,6 +590,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"] + def convert_open_clap_checkpoint(checkpoint): """ Takes a state dict and returns a converted CLAP checkpoint. 
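As a rough sketch of how a converted CLAP checkpoint produced by the helper above would typically be consumed: this is illustrative only, `text_config` and `checkpoint` are placeholders (not taken from the patch) for the pipeline's `ClapTextConfig` and the torch-loaded original `.ckpt` state dict, and the handling of missing keys is an assumption based on `CLAP_EXPECTED_MISSING_KEYS`.

```python
from transformers import ClapTextModelWithProjection

# Illustrative only: `text_config` and `checkpoint` are placeholders for the pipeline's
# ClapTextConfig and the torch-loaded original `.ckpt` state dict.
text_encoder = ClapTextModelWithProjection(text_config)
converted_clap = convert_open_clap_checkpoint(checkpoint)

missing_keys, unexpected_keys = text_encoder.load_state_dict(converted_clap, strict=False)
# only the buffer(s) named in CLAP_EXPECTED_MISSING_KEYS should show up as missing
assert all(key in CLAP_EXPECTED_MISSING_KEYS for key in missing_keys)
```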
@@ -649,6 +648,7 @@ def convert_open_clap_checkpoint(checkpoint): return new_checkpoint + def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. @@ -656,18 +656,21 @@ def create_transformers_vocoder_config(original_config): vocoder_params = original_config.model.params.vocoder_config.params config = dict( - model_in_dim = vocoder_params.num_mels, - sampling_rate = vocoder_params.sampling_rate, - upsample_initial_channel = vocoder_params.upsample_initial_channel, - upsample_rates = list(vocoder_params.upsample_rates), - upsample_kernel_sizes = list(vocoder_params.upsample_kernel_sizes), - resblock_kernel_sizes = list(vocoder_params.resblock_kernel_sizes), - resblock_dilation_sizes = [list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes], - normalize_before=False, + model_in_dim=vocoder_params.num_mels, + sampling_rate=vocoder_params.sampling_rate, + upsample_initial_channel=vocoder_params.upsample_initial_channel, + upsample_rates=list(vocoder_params.upsample_rates), + upsample_kernel_sizes=list(vocoder_params.upsample_kernel_sizes), + resblock_kernel_sizes=list(vocoder_params.resblock_kernel_sizes), + resblock_dilation_sizes=[ + list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + ], + normalize_before=False, ) return config + def convert_hifigan_checkpoint(checkpoint, config): """ Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint. @@ -692,60 +695,61 @@ def convert_hifigan_checkpoint(checkpoint, config): return vocoder_state_dict + # Adapted from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 DEFAULT_CONFIG = { - "model": { - "params": { - "linear_start": 0.0015, - "linear_end": 0.0195, - "timesteps": 1000, - "channels": 8, - "scale_by_std": True, - "unet_config": { - "target": "audioldm.latent_diffusion.openaimodel.UNetModel", - "params": { - "extra_film_condition_dim": 512, - "extra_film_use_concat": True, - "in_channels": 8, - "out_channels": 8, - "model_channels": 128, - "attention_resolutions": [8, 4, 2], - "num_res_blocks": 2, - "channel_mult": [1, 2, 3, 5], - "num_head_channels": 32, - }, - }, - "first_stage_config": { - "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", - "params": { - "embed_dim": 8, - "ddconfig": { - "z_channels": 8, - "resolution": 256, - "in_channels": 1, - "out_ch": 1, - "ch": 128, - "ch_mult": [1, 2, 4], - "num_res_blocks": 2, - }, - }, - }, - "vocoder_config": { - "target": "audioldm.first_stage_model.vocoder", - "params": { - "upsample_rates": [5, 4, 2, 2, 2], - "upsample_kernel_sizes": [16, 16, 8, 4, 4], - "upsample_initial_channel": 1024, - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "num_mels": 64, - "sampling_rate": 16000, - } - - } + "model": { + "params": { + "linear_start": 0.0015, + "linear_end": 0.0195, + "timesteps": 1000, + "channels": 8, + "scale_by_std": True, + "unet_config": { + "target": "audioldm.latent_diffusion.openaimodel.UNetModel", + "params": { + "extra_film_condition_dim": 512, + "extra_film_use_concat": True, + "in_channels": 8, + "out_channels": 8, + "model_channels": 128, + "attention_resolutions": [8, 4, 2], + "num_res_blocks": 2, + "channel_mult": [1, 2, 3, 5], + "num_head_channels": 32, + }, + }, + "first_stage_config": { + "target": 
"audioldm.variational_autoencoder.autoencoder.AutoencoderKL", + "params": { + "embed_dim": 8, + "ddconfig": { + "z_channels": 8, + "resolution": 256, + "in_channels": 1, + "out_ch": 1, + "ch": 128, + "ch_mult": [1, 2, 4], + "num_res_blocks": 2, }, }, - } + }, + "vocoder_config": { + "target": "audioldm.first_stage_model.vocoder", + "params": { + "upsample_rates": [5, 4, 2, 2, 2], + "upsample_kernel_sizes": [16, 16, 8, 4, 4], + "upsample_initial_channel": 1024, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "num_mels": 64, + "sampling_rate": 16000, + }, + }, + }, + }, +} + def load_pipeline_from_original_audioldm_ckpt( checkpoint_path: str, @@ -765,22 +769,21 @@ def load_pipeline_from_original_audioldm_ckpt( global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is recommended that you override the default values and/or supply an `original_config_file` wherever possible. - :param checkpoint_path: Path to `.ckpt` file. - :param original_config_file: Path to `.yaml` config file corresponding to the original architecture. + :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file + corresponding to the original architecture. If `None`, will be automatically instantiated based on default values. - :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints. - :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original + :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints. :param + prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original AudioLDM checkpoints. :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically inferred. :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", "ddim"]`. :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract - the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights - usually yield higher quality images for inference. Non-EMA weights are usually better to continue - fine-tuning. - :param device: The device to use. Pass `None` to determine automatically. - :param from_safetensors: If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors + the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually + yield higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning. + :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If + `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. 
""" diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index d71335692f68..e4f1fc299751 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -17,7 +17,6 @@ import torch import torch.nn.functional as F - from packaging import version from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan @@ -159,8 +158,8 @@ def disable_vae_slicing(self): def enable_sequential_cpu_offload(self, gpu_id=0): r""" Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, - text_encoder, vae and vocoder have their state dicts saved to CPU and then are moved to a - `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + text_encoder, vae and vocoder have their state dicts saved to CPU and then are moved to a `torch.device('meta') + and loaded to GPU only when their specific submodule has its `forward` method called. """ if is_accelerate_available(): from accelerate import cpu_offload @@ -264,7 +263,10 @@ def _encode_prompt( prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) - bs_embed, seq_len, = prompt_embeds.shape + ( + bs_embed, + seq_len, + ) = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt) prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len) @@ -450,8 +452,8 @@ def __call__( The prompt or prompts to guide the audio generation. If not defined, one has to pass `prompt_embeds`. instead. height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated spectrogram. Using a larger height results in a longer spectrogram - and thus longer audio sample. + The height in pixels of the generated spectrogram. Using a larger height results in a longer + spectrogram and thus longer audio sample. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor // 8): The width in pixels of the generated spectrogram. 
num_inference_steps (`int`, *optional*, defaults to 200): diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 099a5903e044..4f61b7b369b4 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -21,7 +21,13 @@ import numpy as np import torch import torch.nn.functional as F -from transformers import ClapTextConfig, ClapTextModelWithProjection, RobertaTokenizer, SpeechT5HifiGan, SpeechT5HifiGanConfig +from transformers import ( + ClapTextConfig, + ClapTextModelWithProjection, + RobertaTokenizer, + SpeechT5HifiGan, + SpeechT5HifiGanConfig, +) from diffusers import ( AudioLDMPipeline, @@ -94,14 +100,14 @@ def get_dummy_components(self): tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, + model_in_dim=8, + sampling_rate=16000, + upsample_initial_channel=16, + upsample_rates=[2, 2], + upsample_kernel_sizes=[4, 4], + resblock_kernel_sizes=[3, 7], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], + normalize_before=False, ) vocoder = SpeechT5HifiGan(vocoder_config) @@ -145,7 +151,9 @@ def test_audioldm_ddim(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]) + expected_slice = np.array( + [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -253,7 +261,7 @@ def test_audioldm_ddim_factor_8(self): assert len(audio) == 544 audio_slice = audio[-10:] - expected_slice = np.array([-0.0029, 0.0036, -0.0027, 0.0032, -0.0029, 0.0034, -0.0028, 0.0073, 0.0039, 0.0058]) + expected_slice = np.array([-0.0029, 0.0036, -0.0027, 0.0032, -0.0029, 0.0034, -0.0028, 0.0073, 0.0039, 0.0058]) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -273,7 +281,9 @@ def test_audioldm_pndm(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + expected_slice = np.array( + [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -294,7 +304,9 @@ def test_audioldm_k_lms(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + expected_slice = np.array( + [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -315,7 +327,9 @@ def test_audioldm_k_euler_ancestral(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + expected_slice = np.array( + [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -336,7 +350,9 @@ def test_audioldm_k_euler(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0051, 0.0050, -0.0060, 
0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]) + expected_slice = np.array( + [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -380,8 +396,9 @@ def test_audioldm_negative_prompt(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, - 0.0033, -0.0028, 0.0032]) + expected_slice = np.array( + [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-3 @@ -408,9 +425,7 @@ def test_audioldm_num_waveforms_per_prompt(self): # test num_waveforms_per_prompt for single prompt num_waveforms_per_prompt = 2 - audios = audioldm_pipe( - prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios + audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios assert audios.shape == (num_waveforms_per_prompt, 256) @@ -503,7 +518,7 @@ def test_audioldm_width_opt(self): config = audioldm_pipe.vocoder.config config.model_in_dim = width * 2 audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1, width=width*2) + output = audioldm_pipe(prompt, num_inference_steps=1, width=width * 2) audio_shape = output.audios.shape # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram assert audio_shape == (1, 256) @@ -597,12 +612,12 @@ def test_audioldm_ddim(self): inputs = self.get_inputs(torch_device) audio = audioldm_pipe(**inputs).audios[0] - + assert audio.ndim == 1 assert len(audio) == 81952 audio_slice = audio[3880:3890] - expected_slice = np.array([-0.0574, 0.2462, 0.3955, 0.4213, 0.3901, 0.3770, 0.2762, 0.0206, -0.2208, -0.3282]) + expected_slice = np.array([-0.0574, 0.2462, 0.3955, 0.4213, 0.3901, 0.3770, 0.2762, 0.0206, -0.2208, -0.3282]) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-3 @@ -619,7 +634,7 @@ def test_audioldm_lms(self): assert len(audio) == 81952 audio_slice = audio[27780:27790] - expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) + expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-3 @@ -636,7 +651,7 @@ def test_audioldm_dpm(self): assert len(audio) == 81952 audio_slice = audio[69310:69320] - expected_slice = np.array([ 0.1842, 0.2411, 0.3127, 0.3069, 0.2287, 0.0948, -0.0071, -0.041 , -0.1293, -0.2075]) + expected_slice = np.array([0.1842, 0.2411, 0.3127, 0.3069, 0.2287, 0.0948, -0.0071, -0.041, -0.1293, -0.2075]) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-3 @@ -730,18 +745,14 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 8, 128, 16) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.6730, -0.9062, 1.0400, 0.4220, -0.9785, 1.817, 0.1906, -1.3430, 1.3330] - ) + expected_slice = np.array([-0.6730, -0.9062, 1.0400, 0.4220, -0.9785, 1.817, 0.1906, -1.3430, 1.3330]) assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 8, 128, 16) latents_slice = latents[0, 
-3:, -3:, -1] - expected_slice = np.array( - [-0.6763, -0.9062, 1.0520, 0.4200, -0.9750, 1.8220, 0.1879, -1.3490, 1.3190] - ) + expected_slice = np.array([-0.6763, -0.9062, 1.0520, 0.4200, -0.9750, 1.8220, 0.1879, -1.3490, 1.3190]) assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 @@ -818,9 +829,7 @@ def test_audioldm_ddim(self): inputs = self.get_inputs(torch_device) audios = audioldm_pipe(**inputs).audios[0] - expected_audios = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - ) + expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 @@ -832,9 +841,7 @@ def test_audioldm_lms(self): inputs = self.get_inputs(torch_device) audios = audioldm_pipe(**inputs).audios[0] - expected_audios = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - ) + expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 @@ -846,9 +853,7 @@ def test_audioldm_euler(self): inputs = self.get_inputs(torch_device) audios = audioldm_pipe(**inputs).audios[0] - expected_audios = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - ) + expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 @@ -861,8 +866,6 @@ def test_audioldm_dpm(self): inputs["num_inference_steps"] = 25 audios = audioldm_pipe(**inputs).audios[0] - expected_audios = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - ) + expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") max_diff = np.abs(expected_audios - audios).max() assert max_diff < 1e-3 From 447013e4892ee5f7763c84aadb69bd12f05ba95a Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:33:48 +0100 Subject: [PATCH 16/66] clean-up: make fix-copies --- .../pipelines/audioldm/convert_from_ckpt.py | 31 ++++++++++++------- .../dummy_torch_and_transformers_objects.py | 15 +++++++++ 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index ce21640e1405..be357f007c82 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -42,7 +42,7 @@ from ...utils.import_utils import BACKENDS_MAPPING -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.shave_segments +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments def shave_segments(path, n_shave_prefix_segments=1): """ Removes segments. Positive values shave the first segments, negative shave the last segments. 
@@ -53,7 +53,7 @@ def shave_segments(path, n_shave_prefix_segments=1): return ".".join(path.split(".")[:n_shave_prefix_segments]) -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_resnet_paths +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths def renew_resnet_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside resnets to the new naming scheme (local renaming) @@ -76,7 +76,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_vae_resnet_paths +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside resnets to the new naming scheme (local renaming) @@ -93,7 +93,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): return mapping -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_attention_paths +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths def renew_attention_paths(old_list): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -101,12 +101,21 @@ def renew_attention_paths(old_list): mapping = [] for old_item in old_list: new_item = old_item + + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') + + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + mapping.append({"old": old_item, "new": new_item}) return mapping -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.renew_vae_attention_paths +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): """ Updates paths inside attentions to the new naming scheme (local renaming) @@ -137,7 +146,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): return mapping -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.assign_to_checkpoint +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint def assign_to_checkpoint( paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None ): @@ -189,7 +198,7 @@ def assign_to_checkpoint( checkpoint[new_path] = old_checkpoint[path["old"]] -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_attn_to_linear +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear def conv_attn_to_linear(checkpoint): keys = list(checkpoint.keys()) attn_keys = ["query.weight", "key.weight", "value.weight"] @@ -248,7 +257,7 @@ def create_unet_diffusers_config(original_config, image_size: int): return config -# Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.create_vae_diffusers_config +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config def create_vae_diffusers_config(original_config, checkpoint, image_size: int): """ Creates a VAE config for diffusers based on the config of the original AudioLDM model. 
Compared to the original @@ -277,7 +286,7 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): return config -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.create_diffusers_schedular +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( num_train_timesteps=original_config.model.params.timesteps, @@ -288,7 +297,7 @@ def create_diffusers_schedular(original_config): return schedular -# Adapted from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_ldm_unet_checkpoint +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion @@ -466,7 +475,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False return new_checkpoint -# Copied from diffusers.pipelines.stable_diffusion.convert_from_checkpoint.convert_ldm_vae_checkpoint +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint def convert_ldm_vae_checkpoint(checkpoint, config): # extract state dict for VAE vae_state_dict = {} diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 79755c27e6fe..8ed09b438e6f 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -32,6 +32,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AudioLDMPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class CycleDiffusionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 08d6a1f626d6e320d2c408b2770490ce7ac5aab1 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:39:14 +0100 Subject: [PATCH 17/66] fix: add doc path to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f3175e9b7f8a..9bd745408cea 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -110,6 +110,8 @@ title: AltDiffusion - local: api/pipelines/audio_diffusion title: Audio Diffusion + - local: api/pipelines/audioldm + title: AudioLDM - local: api/pipelines/cycle_diffusion title: Cycle Diffusion - local: api/pipelines/dance_diffusion From 9597761ef772ce2b154c8ca9dd3f70b6766c9eaa Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:44:23 +0100 Subject: [PATCH 18/66] clean-up: args for conversion script --- .../convert_original_audioldm_to_diffusers.py | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 279e7986b4a6..0adf0cf51280 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ 
b/scripts/convert_original_audioldm_to_diffusers.py @@ -23,13 +23,8 @@ parser = argparse.ArgumentParser() parser.add_argument( - "--checkpoint_path", - default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", - type=str, - required=False, # TODO: revert to True - help="Path to the checkpoint to convert.", + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml parser.add_argument( "--original_config_file", default=None, @@ -48,12 +43,6 @@ type=str, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", ) - parser.add_argument( - "--pipeline_type", - default=None, - type=str, - help="The pipeline type. If `None` pipeline will be automatically inferred.", - ) parser.add_argument( "--image_size", default=None, @@ -68,7 +57,7 @@ ) parser.add_argument( "--extract_ema", - action="store_false", # TODO: revert to store_true + action="store_true", help=( "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" @@ -85,13 +74,7 @@ action="store_true", help="Whether to store pipeline in safetensors format or not.", ) - parser.add_argument( - "--dump_path", - default="/Users/sanchitgandhi/convert-audioldm/diffusers_out_2", - type=str, - required=False, # TODO: revert to True - help="Path to the output model.", - ) + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") args = parser.parse_args() From 10c584d383efd3bf7b5a83a2b287354498404c2c Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 15:49:12 +0100 Subject: [PATCH 19/66] clean-up: paths to checkpoints --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index e4f1fc299751..8ecddf7be322 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -29,14 +29,14 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# TODO: update doc string with checkpoint path +# TODO(SG): move checkpoint to correct org and update EXAMPLE_DOC_STRING = """ Examples: ```py >>> import torch >>> from diffusers import AudioLDMPipeline - >>> pipe = AudioLDMPipeline.from_pretrained("org/audioldm-checkpoint", torch_dtype=torch.float16) + >>> pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") >>> prompt = "A hammer hitting a wooden surface" @@ -117,7 +117,7 @@ def __init__( deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- org/audioldm-checkpoint \n you should change 'sample_size' to 64 in the" + " following: \n- sanchit-gandhi/audioldm-text-to-audio \n you should change 'sample_size' to 64 in the" " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" From 0f154083b7596f7d1e2b3d2cdd26026f65a2fbad Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 16:36:57 +0100 Subject: [PATCH 20/66] fix: use conditional unet --- src/diffusers/models/unet_2d.py | 63 ++++--------------- src/diffusers/models/unet_2d_condition.py | 25 ++++++++ .../pipelines/audioldm/convert_from_ckpt.py | 4 +- .../pipelines/audioldm/pipeline_audioldm.py | 1 + tests/pipelines/audioldm/test_audioldm.py | 6 +- 5 files changed, 42 insertions(+), 57 deletions(-) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 112710d869f4..770585c020bf 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -101,9 +101,6 @@ def __init__( add_attention: bool = True, class_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, - extra_film_condition_dim: int = None, - extra_film_use_concat: bool = False, - cross_attention_dim: int = None, ): super().__init__() @@ -133,20 +130,6 @@ def __init__( else: self.class_embedding = None - # film condition - if self.class_embedding is not None and extra_film_condition_dim is not None: - raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") - self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat - - if extra_film_condition_dim is not None: - self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) - else: - self.film_embedding = None - - if self.use_extra_film_by_concat: - # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim - time_embed_dim = time_embed_dim * 2 - self.down_blocks = nn.ModuleList([]) self.mid_block = None self.up_blocks = nn.ModuleList([]) @@ -171,35 +154,21 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, ) self.down_blocks.append(down_block) # mid - if cross_attention_dim is not None: - self.mid_block = UNetMidBlock2DCrossAttn( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim, - resnet_groups=norm_num_groups, - ) - else: - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, - resnet_eps=norm_eps, - resnet_act_fn=act_fn, - output_scale_factor=mid_block_scale_factor, - resnet_time_scale_shift=resnet_time_scale_shift, - attn_num_head_channels=attention_head_dim, - resnet_groups=norm_num_groups, - add_attention=add_attention, - ) + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, + add_attention=add_attention, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -224,7 +193,6 @@ def __init__( 
resnet_groups=norm_num_groups, attn_num_head_channels=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -287,15 +255,6 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb - if self.film_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when doing film embedding") - film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) - if self.use_extra_film_by_concat: - emb = torch.cat([emb, film_emb], dim=-1) - else: - emb = emb + film_emb - # 2. pre-process skip_sample = sample sample = self.conv_in(sample) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index ba2c09b297b9..4b8bab9622e9 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -145,6 +145,8 @@ def __init__( time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, conv_out_kernel: int = 3, + extra_film_condition_dim: int = None, + extra_film_use_concat: bool = False, ): super().__init__() @@ -196,6 +198,20 @@ def __init__( self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) + # film condition + if self.class_embedding is not None and extra_film_condition_dim is not None: + raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") + self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat + + if extra_film_condition_dim is not None: + self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + else: + self.film_embedding = None + + if self.use_extra_film_by_concat: + # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim + time_embed_dim = time_embed_dim * 2 + if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) @@ -533,6 +549,15 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb + if self.film_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when doing film embedding") + film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) + if self.use_extra_film_by_concat: + emb = torch.cat([emb, film_emb], dim=-1) + else: + emb = emb + film_emb + # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index be357f007c82..50260cafa5bd 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -35,7 +35,7 @@ HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DModel, + UNet2DModel, UNet2DConditionModel, ) from ...utils import is_omegaconf_available, is_safetensors_available @@ -882,7 +882,7 @@ def load_pipeline_from_original_audioldm_ckpt( # Convert the UNet2DModel unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet = UNet2DModel(**unet_config) + unet = UNet2DConditionModel(**unet_config) converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 8ecddf7be322..4aaf1f88b0dc 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -575,6 +575,7 @@ def __call__( noise_pred = self.unet( latent_model_input, t, + encoder_hidden_states=None, class_labels=prompt_embeds, ).sample diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 4f61b7b369b4..8a15b7490bcf 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -38,7 +38,7 @@ EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DModel, + UNet2DConditionModel, logging, ) from diffusers.utils import load_numpy, nightly, slow, torch_device @@ -55,7 +55,7 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) - unet = UNet2DModel( + unet = UNet2DConditionModel( block_out_channels=(32, 64), layers_per_block=2, sample_size=32, @@ -495,7 +495,7 @@ def test_audioldm_height_opt(self): config = dict(audioldm_pipe.unet.config) config["sample_size"] = 96 - audioldm_pipe.unet = UNet2DModel.from_config(config).to(torch_device) + audioldm_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) output = audioldm_pipe(prompt, num_inference_steps=1, width=8) # need to keep width fixed for vocoder audio_shape = output.audios.shape assert audio_shape == (1, 768) From d99c9e845fde67e1ebba7a32e039f4e0910d7779 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 16:37:51 +0100 Subject: [PATCH 21/66] clean-up: make style --- src/diffusers/models/unet_2d.py | 2 +- src/diffusers/pipelines/audioldm/convert_from_ckpt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 770585c020bf..35f5dc34574c 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -21,7 +21,7 @@ from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_2d_blocks import UNetMidBlock2D, UNetMidBlock2DCrossAttn, get_down_block, get_up_block +from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @dataclass diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index 50260cafa5bd..b8769a623f8a 100644 --- 
a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -35,7 +35,7 @@ HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DModel, UNet2DConditionModel, + UNet2DConditionModel, ) from ...utils import is_omegaconf_available, is_safetensors_available From 293f2a4066253de383414cce1482db5d0cf969a2 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:03:28 +0100 Subject: [PATCH 22/66] fix: type hints for UNet --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 4aaf1f88b0dc..be42122a6785 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -21,7 +21,7 @@ from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DModel +from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline @@ -62,7 +62,7 @@ class AudioLDMPipeline(DiffusionPipeline): tokenizer ([`PreTrainedTokenizer`]): Tokenizer of class [RobertaTokenizer](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaTokenizer). - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded audio latents. + unet ([`UNet2DConditionModel`]): U-Net architecture to denoise the encoded audio latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. @@ -76,7 +76,7 @@ def __init__( vae: AutoencoderKL, text_encoder: ClapTextModelWithProjection, tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], - unet: UNet2DModel, + unet: UNet2DConditionModel, scheduler: KarrasDiffusionSchedulers, vocoder: SpeechT5HifiGan, ): From 8b52493a84b25aab6c257a7b401fa65b7be72716 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:24:48 +0100 Subject: [PATCH 23/66] clean-up: docstring for UNet --- src/diffusers/models/unet_2d_condition.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 4b8bab9622e9..4f58096686cc 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -103,6 +103,8 @@ class conditioning with `class_embed_type` equal to `None`. The dimension of `cond_proj` layer in timestep embedding. conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, *optional*, default to `3`): the Kernel size of `conv_out` layer. + extra_film_condition_dim (`int`, *optional*, default to `None`): The dimensionality of the extra film conditioning layer. + extra_film_use_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the extra film embedding with the time embedding or sum them. 
""" _supports_gradient_checkpointing = True @@ -205,13 +207,12 @@ def __init__( if extra_film_condition_dim is not None: self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + if extra_film_use_concat: + # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim + time_embed_dim = time_embed_dim * 2 else: self.film_embedding = None - if self.use_extra_film_by_concat: - # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim - time_embed_dim = time_embed_dim * 2 - if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) @@ -551,7 +552,7 @@ def forward( if self.film_embedding is not None: if class_labels is None: - raise ValueError("class_labels should be provided when doing film embedding") + raise ValueError("class_labels should be provided when extra_film_condition_dim > 0") film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) if self.use_extra_film_by_concat: emb = torch.cat([emb, film_emb], dim=-1) From 00399217f7fa2ac16110a750659b7eb56339647f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:27:10 +0100 Subject: [PATCH 24/66] clean-up: make style --- src/diffusers/models/unet_2d_condition.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 4f58096686cc..258abf2409ab 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -103,8 +103,10 @@ class conditioning with `class_embed_type` equal to `None`. The dimension of `cond_proj` layer in timestep embedding. conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, *optional*, default to `3`): the Kernel size of `conv_out` layer. - extra_film_condition_dim (`int`, *optional*, default to `None`): The dimensionality of the extra film conditioning layer. - extra_film_use_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the extra film embedding with the time embedding or sum them. + extra_film_condition_dim (`int`, *optional*, default to `None`): + The dimensionality of the extra film conditioning layer. + extra_film_use_concat (`bool`, *optional*, defaults to `False`): + Whether to concatenate the extra film embedding with the time embedding or sum them. """ _supports_gradient_checkpointing = True From 0be07899dc3228bff8aa478ecb498a930f88f748 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:29:21 +0100 Subject: [PATCH 25/66] clean-up: remove duplicate in docstring --- src/diffusers/models/unet_2d_condition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 207fc827d55e..05e11778de8d 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -102,7 +102,6 @@ class conditioning with `class_embed_type` equal to `None`. time_cond_proj_dim (`int`, *optional*, default to `None`): The dimension of `cond_proj` layer in timestep embedding. conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. - conv_out_kernel (`int`, *optional*, default to `3`): the Kernel size of `conv_out` layer. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. 
projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. From 3f5f8630442de41ef900877b4fd30d23d4ac8391 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:34:33 +0100 Subject: [PATCH 26/66] clean-up: make style --- src/diffusers/models/unet_2d_condition.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 05e11778de8d..8ffa9a63ffb2 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -105,8 +105,10 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - extra_film_condition_dim (`int`, *optional*, default to `None`): The dimensionality of the extra film conditioning layer. - extra_film_use_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the extra film embedding with the time embedding or sum them. + extra_film_condition_dim (`int`, *optional*, default to `None`): + The dimensionality of the extra film conditioning layer. + extra_film_use_concat (`bool`, *optional*, defaults to `False`): + Whether to concatenate the extra film embedding with the time embedding or sum them. """ _supports_gradient_checkpointing = True From 3033ac1f7387d623cf808ac110a24c23ea550d0d Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 17:35:38 +0100 Subject: [PATCH 27/66] clean-up: make fix-copies --- .../versatile_diffusion/modeling_text_unet.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 4adf9eed0e29..8c7f0ed1aa10 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -186,6 +186,10 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. + extra_film_condition_dim (`int`, *optional*, default to `None`): + The dimensionality of the extra film conditioning layer. + extra_film_use_concat (`bool`, *optional*, defaults to `False`): + Whether to concatenate the extra film embedding with the time embedding or sum them. 
""" _supports_gradient_checkpointing = True @@ -234,6 +238,8 @@ def __init__( conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, + extra_film_condition_dim: int = None, + extra_film_use_concat: bool = False, ): super().__init__() @@ -323,6 +329,19 @@ def __init__( self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) + # film condition + if self.class_embedding is not None and extra_film_condition_dim is not None: + raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") + self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat + + if extra_film_condition_dim is not None: + self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) + if extra_film_use_concat: + # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim + time_embed_dim = time_embed_dim * 2 + else: + self.film_embedding = None + if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) @@ -660,6 +679,15 @@ def forward( class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb + if self.film_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when extra_film_condition_dim > 0") + film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) + if self.use_extra_film_by_concat: + emb = torch.cat([emb, film_emb], dim=-1) + else: + emb = emb + film_emb + # 2. pre-process sample = self.conv_in(sample) From dd1882fae87ca774697d021651520ca998449614 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 21 Feb 2023 18:35:09 +0100 Subject: [PATCH 28/66] clean-up: move imports to start in code snippet --- docs/source/en/api/pipelines/audioldm.mdx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index 9c8aed78faa2..f6c8966a4769 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -30,6 +30,7 @@ This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit ```python from diffusers import AudioLDMPipeline import torch +import scipy repo_id = "sanchit-gandhi/audioldm-text-to-audio" pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) @@ -39,8 +40,6 @@ prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" audio = pipe(prompt, num_inference_steps=10, height=512).audios[0] # save the audio sample as a .wav file -import scipy - scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) ``` From 4471f0887cb5cc6a1a886903e0d67412591aa349 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Wed, 22 Feb 2023 09:49:35 +0100 Subject: [PATCH 29/66] fix: pass cross_attention_dim as a list/tuple to unet --- src/diffusers/models/attention.py | 2 +- src/diffusers/models/unet_2d_condition.py | 22 +++++-- .../pipelines/audioldm/convert_from_ckpt.py | 6 +- tests/models/test_models_unet_2d_condition.py | 62 +++++++++++++++++++ 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 5d37b88b8e97..3cdc7177a411 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -236,7 +236,7 @@ def __init__( if cross_attention_dim is not None: self.attn2 = CrossAttention( query_dim=dim, - 
cross_attention_dim=cross_attention_dim if only_cross_attention else None, + cross_attention_dim=cross_attention_dim, heads=num_attention_heads, dim_head=attention_head_dim, dropout=dropout, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 8ffa9a63ffb2..7edd23015a91 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -86,7 +86,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. If `None`, it will skip the normalization and activation layers in post-processing norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. - cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. @@ -138,7 +139,7 @@ def __init__( act_fn: str = "silu", norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, - cross_attention_dim: int = 1280, + cross_attention_dim: Union[int, Tuple[int]] = 1280, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, @@ -180,6 +181,11 @@ def __init__( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." ) + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." 
+ ) + # input conv_in_padding = (conv_in_kernel - 1) // 2 self.conv_in = nn.Conv2d( @@ -258,6 +264,9 @@ def __init__( if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + # down output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_block_types): @@ -275,7 +284,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[i], attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, dual_cross_attention=dual_cross_attention, @@ -295,7 +304,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, dual_cross_attention=dual_cross_attention, @@ -309,7 +318,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, resnet_time_scale_shift=resnet_time_scale_shift, @@ -325,6 +334,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) only_cross_attention = list(reversed(only_cross_attention)) output_channel = reversed_block_out_channels[0] @@ -353,7 +363,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=reversed_cross_attention_dim[i], attn_num_head_channels=reversed_attention_head_dim[i], dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index b8769a623f8a..f9d0a673138f 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -236,6 +236,10 @@ def create_unet_diffusers_config(original_config, image_size: int): vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + cross_attention_dim = ( + unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + ) + extra_film_condition_dim = ( unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None ) @@ -249,7 +253,7 @@ def create_unet_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=True, # TODO(SG): hacky - what are we doing re cross attention? 
+ cross_attention_dim=cross_attention_dim, extra_film_condition_dim=extra_film_condition_dim, extra_film_use_concat=extra_film_use_concat, ) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 6ee8c2ffc002..7c9d17193ae3 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -199,6 +199,68 @@ def test_model_with_use_linear_projection(self): expected_shape = inputs_dict["sample"].shape self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + def test_model_with_cross_attention_dim_tuple(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["cross_attention_dim"] = (32, 32) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.sample + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + def test_model_with_extra_film_condition_dim(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + batch_size, _, _, sample_size = inputs_dict["sample"].shape + init_dict["extra_film_condition_dim"] = sample_size + inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.sample + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + def test_model_with_extra_film_use_concat(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + batch_size, _, _, sample_size = inputs_dict["sample"].shape + init_dict["extra_film_condition_dim"] = sample_size + init_dict["extra_film_use_concat"] = True + inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.sample + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + def test_model_attention_slicing(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() From e81696ff70a92bdb7fb65cf3bdf70a6f4b587587 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Wed, 22 Feb 2023 09:51:40 +0100 Subject: [PATCH 30/66] clean-up: make fix-copies --- .../versatile_diffusion/modeling_text_unet.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 8c7f0ed1aa10..3419d5adae13 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -167,7 +167,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. 
If `None`, it will skip the normalization and activation layers in post-processing norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. - cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. @@ -224,7 +225,7 @@ def __init__( act_fn: str = "silu", norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, - cross_attention_dim: int = 1280, + cross_attention_dim: Union[int, Tuple[int]] = 1280, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, @@ -270,6 +271,12 @@ def __init__( f" {attention_head_dim}. `down_block_types`: {down_block_types}." ) + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + "Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`:" + f" {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + # input conv_in_padding = (conv_in_kernel - 1) // 2 self.conv_in = LinearMultiDim( @@ -348,6 +355,9 @@ def __init__( if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + # down output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_block_types): @@ -365,7 +375,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[i], attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, dual_cross_attention=dual_cross_attention, @@ -385,7 +395,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, dual_cross_attention=dual_cross_attention, @@ -399,7 +409,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, resnet_time_scale_shift=resnet_time_scale_shift, @@ -415,6 +425,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) only_cross_attention = list(reversed(only_cross_attention)) output_channel = reversed_block_out_channels[0] @@ -443,7 +454,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=reversed_cross_attention_dim[i], attn_num_head_channels=reversed_attention_head_dim[i], dual_cross_attention=dual_cross_attention, 
use_linear_projection=use_linear_projection, From b8165a1f5f527b44634cb018fe5bfb67977d573c Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Wed, 22 Feb 2023 11:14:58 +0100 Subject: [PATCH 31/66] fix: update checkpoint path --- docs/source/en/api/pipelines/audioldm.mdx | 12 ++++----- .../pipelines/audioldm/pipeline_audioldm.py | 5 ++-- tests/pipelines/audioldm/test_audioldm.py | 26 +++++++++---------- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index f6c8966a4769..385f622abc02 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -25,14 +25,14 @@ This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit ## Text-to-Audio -- *Text-to-Audio* [sanchit-gandhi/audioldm-text-to-audio](https://huggingface.co/sanchit-gandhi/audioldm-text-to-audio) with [`AudioLDMPipeline`] +- *Text-to-Audio* [cvssp/audioldm](https://huggingface.co/cvssp/audioldm) with [`AudioLDMPipeline`] ```python from diffusers import AudioLDMPipeline import torch import scipy -repo_id = "sanchit-gandhi/audioldm-text-to-audio" +repo_id = "cvssp/audioldm" pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) pipe = pipe.to("cuda") @@ -64,14 +64,12 @@ method, or pass the `scheduler` argument to the `from_pretrained` method of the ```python >>> from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler ->>> pipeline = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm") >>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) >>> # or ->>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained( -... "sanchit-gandhi/audioldm-text-to-audio", subfolder="scheduler" -... ) ->>> pipeline = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", scheduler=dpm_scheduler) +>>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm", subfolder="scheduler") +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", scheduler=dpm_scheduler) ``` ## AudioLDMPipeline diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index be42122a6785..47b588631092 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -29,14 +29,13 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# TODO(SG): move checkpoint to correct org and update EXAMPLE_DOC_STRING = """ Examples: ```py >>> import torch >>> from diffusers import AudioLDMPipeline - >>> pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + >>> pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) >>> pipe = pipe.to("cuda") >>> prompt = "A hammer hitting a wooden surface" @@ -117,7 +116,7 @@ def __init__( deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- sanchit-gandhi/audioldm-text-to-audio \n you should change 'sample_size' to 64 in the" + " following: \n- cvssp/audioldm \n you should change 'sample_size' to 64 in the" " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 8a15b7490bcf..3c65871d9b4a 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -605,7 +605,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 return inputs def test_audioldm_ddim(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") audioldm_pipe.scheduler = DDIMScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe = audioldm_pipe.to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) @@ -622,7 +622,7 @@ def test_audioldm_ddim(self): assert max_diff < 1e-3 def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe = audioldm_pipe.to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) @@ -639,7 +639,7 @@ def test_audioldm_lms(self): assert max_diff < 1e-3 def test_audioldm_dpm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio") + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe = audioldm_pipe.to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) @@ -658,7 +658,7 @@ def test_audioldm_dpm(self): def test_audioldm_attention_slicing(self): # TODO(SG): fix or remove. 
This test yields the same memory for with / without attn slicing torch.cuda.reset_peak_memory_stats() - pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -684,7 +684,7 @@ def test_audioldm_attention_slicing(self): def test_audioldm_vae_slicing(self): torch.cuda.reset_peak_memory_stats() - pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -717,7 +717,7 @@ def test_audioldm_vae_slicing(self): def test_audioldm_fp16_vs_autocast(self): # this test makes sure that the original model with autocast # and the new model with fp16 yield the same result - pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -758,7 +758,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: callback_fn.has_been_called = False - pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -769,7 +769,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: assert number_of_steps == inputs["num_inference_steps"] def test_audioldm_low_cpu_mem_usage(self): - pipeline_id = "sanchit-gandhi/audioldm-text-to-audio" + pipeline_id = "cvssp/audioldm" start_time = time.time() pipeline_low_cpu_mem_usage = AudioLDMPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) @@ -787,7 +787,7 @@ def test_audioldm_pipeline_with_sequential_cpu_offloading(self): torch.cuda.reset_max_memory_allocated() torch.cuda.reset_peak_memory_stats() - pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio", torch_dtype=torch.float16) + pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) @@ -823,7 +823,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 return inputs def test_audioldm_ddim(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) @@ -834,7 +834,7 @@ def test_audioldm_ddim(self): assert max_diff < 1e-3 def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe.set_progress_bar_config(disable=None) @@ -846,7 +846,7 @@ def test_audioldm_lms(self): assert max_diff < 1e-3 def 
test_audioldm_euler(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) audioldm_pipe.scheduler = EulerDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe.set_progress_bar_config(disable=None) @@ -858,7 +858,7 @@ def test_audioldm_euler(self): assert max_diff < 1e-3 def test_audioldm_dpm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("sanchit-gandhi/audioldm-text-to-audio").to(torch_device) + audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe.set_progress_bar_config(disable=None) From 1a1dc585c276f3d7e0721f77b5e4d46e8b36168d Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Wed, 22 Feb 2023 14:05:08 +0100 Subject: [PATCH 32/66] fix: unet cross_attention_dim in tests --- tests/pipelines/audioldm/test_audioldm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 3c65871d9b4a..2348908a26cf 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -63,7 +63,7 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, + cross_attention_dim=(32, 64), extra_film_condition_dim=32, extra_film_use_concat=True, ) From 3947e375de4c16706e4da7c999ad8cdc720e3db1 Mon Sep 17 00:00:00 2001 From: William Berman Date: Fri, 24 Feb 2023 11:09:50 -0800 Subject: [PATCH 33/66] film embeddings -> class embeddings --- src/diffusers/models/unet_2d_condition.py | 45 ++++++++----------- .../versatile_diffusion/modeling_text_unet.py | 45 ++++++++----------- 2 files changed, 36 insertions(+), 54 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 7edd23015a91..973eb0b59f72 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -153,8 +153,7 @@ def __init__( conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, - extra_film_condition_dim: int = None, - extra_film_use_concat: bool = False, + class_embeddings_concat: bool = False, ): super().__init__() @@ -239,25 +238,14 @@ def __init__( # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
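            # Here the "projection" class embedding is itself a small MLP (Linear -> activation -> Linear), whereas
            # the "simple_projection" variant below is a single `nn.Linear`. The latter matches the FiLM conditioning
            # layer used by AudioLDM, which passes the pooled CLAP text embedding as `class_labels`.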
self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) - # film condition - if self.class_embedding is not None and extra_film_condition_dim is not None: - raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") - self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat - - if extra_film_condition_dim is not None: - self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) - if extra_film_use_concat: - # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim - time_embed_dim = time_embed_dim * 2 - else: - self.film_embedding = None - if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) @@ -267,6 +255,14 @@ def __init__( if isinstance(cross_attention_dim, int): cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + # down output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_block_types): @@ -279,7 +275,7 @@ def __init__( num_layers=layers_per_block, in_channels=input_channel, out_channels=output_channel, - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, add_downsample=not is_final_block, resnet_eps=norm_eps, resnet_act_fn=act_fn, @@ -299,7 +295,7 @@ def __init__( if mid_block_type == "UNetMidBlock2DCrossAttn": self.mid_block = UNetMidBlock2DCrossAttn( in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, @@ -314,7 +310,7 @@ def __init__( elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": self.mid_block = UNetMidBlock2DSimpleCrossAttn( in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, @@ -358,7 +354,7 @@ def __init__( in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, add_upsample=add_upsample, resnet_eps=norm_eps, resnet_act_fn=act_fn, @@ -597,16 +593,11 @@ def forward( class_labels = self.time_proj(class_labels) class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) - emb = emb + class_emb - if self.film_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when extra_film_condition_dim > 0") - film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) - if self.use_extra_film_by_concat: - emb = torch.cat([emb, film_emb], dim=-1) + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) else: - emb = emb + film_emb + emb = emb + class_emb # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 3419d5adae13..cfd7b8160822 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -239,8 +239,7 @@ def __init__( conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, - extra_film_condition_dim: int = None, - extra_film_use_concat: bool = False, + class_embeddings_concat: bool = False, ): super().__init__() @@ -330,25 +329,14 @@ def __init__( # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) - # film condition - if self.class_embedding is not None and extra_film_condition_dim is not None: - raise ValueError("You cannot set both `class_embed_type` and `extra_film_condition_dim`.") - self.use_extra_film_by_concat = extra_film_condition_dim is not None and extra_film_use_concat - - if extra_film_condition_dim is not None: - self.film_embedding = nn.Linear(extra_film_condition_dim, time_embed_dim) - if extra_film_use_concat: - # we're concatenating the time embeddings and film embeddings so need to double the resnet embedding dim - time_embed_dim = time_embed_dim * 2 - else: - self.film_embedding = None - if isinstance(only_cross_attention, bool): only_cross_attention = [only_cross_attention] * len(down_block_types) @@ -358,6 +346,14 @@ def __init__( if isinstance(cross_attention_dim, int): cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + # down output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_block_types): @@ -370,7 +366,7 @@ def __init__( num_layers=layers_per_block, in_channels=input_channel, out_channels=output_channel, - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, add_downsample=not is_final_block, resnet_eps=norm_eps, resnet_act_fn=act_fn, @@ -390,7 +386,7 @@ def __init__( if mid_block_type == "UNetMidBlockFlatCrossAttn": self.mid_block = UNetMidBlockFlatCrossAttn( in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, @@ -405,7 +401,7 @@ def __init__( elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn": self.mid_block = UNetMidBlockFlatSimpleCrossAttn( in_channels=block_out_channels[-1], - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, @@ -449,7 +445,7 @@ def __init__( in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, - temb_channels=time_embed_dim, + temb_channels=blocks_time_embed_dim, add_upsample=add_upsample, resnet_eps=norm_eps, resnet_act_fn=act_fn, @@ -688,16 +684,11 @@ def forward( class_labels = self.time_proj(class_labels) class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) - emb = emb + class_emb - if self.film_embedding is not None: - if class_labels is None: - raise ValueError("class_labels should be provided when extra_film_condition_dim > 0") - film_emb = self.film_embedding(class_labels).to(dtype=self.dtype) - if self.use_extra_film_by_concat: - emb = torch.cat([emb, film_emb], dim=-1) + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) else: - emb = emb + film_emb + emb = emb + class_emb # 2. pre-process sample = self.conv_in(sample) From 1503f757e63ea8d9d6acb4c3af53affd3633450a Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Mon, 27 Feb 2023 10:04:37 +0100 Subject: [PATCH 34/66] Apply suggestions from code review Co-authored-by: Will Berman --- .../pipelines/audioldm/pipeline_audioldm.py | 52 +++---------------- tests/pipelines/audioldm/test_audioldm.py | 1 - 2 files changed, 6 insertions(+), 47 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 47b588631092..d855ef4a1f61 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -81,51 +81,6 @@ def __init__( ): super().__init__() - if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " - "to update the config accordingly as leaving `steps_offset` might led to incorrect results" - " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," - " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file" - ) - deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["steps_offset"] = 1 - scheduler._internal_dict = FrozenDict(new_config) - - if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: - deprecation_message = ( - f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." - " `clip_sample` should be set to False in the configuration file. Please make sure to update the" - " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" - " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" - " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" - ) - deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(scheduler.config) - new_config["clip_sample"] = False - scheduler._internal_dict = FrozenDict(new_config) - - is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( - version.parse(unet.config._diffusers_version).base_version - ) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 - if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: - deprecation_message = ( - "The configuration file of the unet has set the default `sample_size` to smaller than" - " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" - " following: \n- cvssp/audioldm \n you should change 'sample_size' to 64 in the" - " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" - " in the config might lead to incorrect results in future versions. If you have downloaded this" - " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file" - ) - deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) - new_config = dict(unet.config) - new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) self.register_modules( vae=vae, @@ -136,8 +91,8 @@ def __init__( vocoder=vocoder, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config() + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): r""" Enable sliced VAE decoding. @@ -147,6 +102,7 @@ def enable_vae_slicing(self): """ self.vae.enable_slicing() + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing def disable_vae_slicing(self): r""" Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to @@ -171,6 +127,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0): cpu_offload(cpu_offloaded_model, device) @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device def _execution_device(self): r""" Returns the device on which the pipeline's models will be executed. 
After calling @@ -341,6 +298,7 @@ def mel_spectrogram_to_waveform(self, mel_spectrogram): waveform = waveform.cpu().detach() return waveform + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -358,6 +316,7 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, @@ -405,6 +364,7 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 2348908a26cf..605dc86a5925 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -47,7 +47,6 @@ from ...test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): From 074f883774743e2ac914704ff350c632f0f9914f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 10:39:10 +0100 Subject: [PATCH 35/66] fix: unet film embed to use existing args --- src/diffusers/models/unet_2d_condition.py | 13 ++++++++----- .../pipelines/audioldm/convert_from_ckpt.py | 14 ++++++++------ .../versatile_diffusion/modeling_text_unet.py | 13 ++++++++----- tests/pipelines/audioldm/test_audioldm.py | 7 +++---- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 973eb0b59f72..4d496b2171e3 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -92,7 +92,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately - summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, or `"projection"`. + summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or + `"simple_projection"`. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. @@ -106,10 +107,8 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. 
projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - extra_film_condition_dim (`int`, *optional*, default to `None`): - The dimensionality of the extra film conditioning layer. - extra_film_use_concat (`bool`, *optional*, defaults to `False`): - Whether to concatenate the extra film embedding with the time embedding or sum them. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. """ _supports_gradient_checkpointing = True @@ -239,6 +238,10 @@ def __init__( # As a result, `TimestepEmbedding` can be passed arbitrary vectors. self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py index f9d0a673138f..4f0df0e418d6 100644 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py @@ -240,10 +240,11 @@ def create_unet_diffusers_config(original_config, image_size: int): unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels ) - extra_film_condition_dim = ( + class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None + projection_class_embeddings_input_dim = ( unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None ) - extra_film_use_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None config = dict( sample_size=image_size // vae_scale_factor, @@ -254,8 +255,9 @@ def create_unet_diffusers_config(original_config, image_size: int): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=cross_attention_dim, - extra_film_condition_dim=extra_film_condition_dim, - extra_film_use_concat=extra_film_use_concat, + class_embed_type=class_embed_type, + projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, + class_embeddings_concat=class_embeddings_concat, ) return config @@ -342,8 +344,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - new_checkpoint["film_embedding.weight"] = unet_state_dict["film_emb.weight"] - new_checkpoint["film_embedding.bias"] = unet_state_dict["film_emb.bias"] + new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"] + new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"] new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] diff --git 
a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index cfd7b8160822..411a678da4e3 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -173,7 +173,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately - summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, or `"projection"`. + summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or + `"simple_projection"`. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. @@ -187,10 +188,8 @@ class conditioning with `class_embed_type` equal to `None`. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. - extra_film_condition_dim (`int`, *optional*, default to `None`): - The dimensionality of the extra film conditioning layer. - extra_film_use_concat (`bool`, *optional*, defaults to `False`): - Whether to concatenate the extra film embedding with the time embedding or sum them. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. """ _supports_gradient_checkpointing = True @@ -330,6 +329,10 @@ def __init__( # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 605dc86a5925..387144e9062f 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -47,8 +47,6 @@ from ...test_pipelines_common import PipelineTesterMixin - - class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = AudioLDMPipeline @@ -63,8 +61,9 @@ def get_dummy_components(self): down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=(32, 64), - extra_film_condition_dim=32, - extra_film_use_concat=True, + class_embed_type="simple_projection", + projection_class_embeddings_input_dim=32, + class_embeddings_concat=True, ) scheduler = DDIMScheduler( beta_start=0.00085, From 94dc7617560cdd702db1a852eeee8b8d8f6afb78 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 10:46:19 +0100 Subject: [PATCH 36/66] fix: unet tests to use existing args --- tests/models/test_models_unet_2d_condition.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 7c9d17193ae3..ed1a52beb02d 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -218,11 +218,14 @@ def test_model_with_cross_attention_dim_tuple(self): expected_shape = inputs_dict["sample"].shape self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - def test_model_with_extra_film_condition_dim(self): + def test_model_with_simple_projection(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() batch_size, _, _, sample_size = inputs_dict["sample"].shape - init_dict["extra_film_condition_dim"] = sample_size + + init_dict["class_embed_type"] = "simple_projection" + init_dict["projection_class_embeddings_input_dim"] = sample_size + inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) model = self.model_class(**init_dict) @@ -239,12 +242,15 @@ def test_model_with_extra_film_condition_dim(self): expected_shape = inputs_dict["sample"].shape self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - def test_model_with_extra_film_use_concat(self): + def test_model_with_class_embeddings_concat(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() batch_size, _, _, sample_size = inputs_dict["sample"].shape - init_dict["extra_film_condition_dim"] = sample_size - init_dict["extra_film_use_concat"] = True + + init_dict["class_embed_type"] = "simple_projection" + init_dict["projection_class_embeddings_input_dim"] = sample_size + init_dict["class_embeddings_concat"] = True + inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) model = self.model_class(**init_dict) From e66476e26c188d0f8a164963068f0d8b49c6710f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 10:48:34 +0100 
Subject: [PATCH 37/66] fix: make style --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index d855ef4a1f61..d001f175f7be 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -17,13 +17,11 @@ import torch import torch.nn.functional as F -from packaging import version from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan -from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline @@ -81,7 +79,6 @@ def __init__( ): super().__init__() - self.register_modules( vae=vae, text_encoder=text_encoder, @@ -134,7 +131,7 @@ def _execution_device(self): `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module hooks. """ - if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + if not hasattr(self.unet, "_hf_hook"): return self.device for module in self.unet.modules(): if ( From 9f776a2845abd457a6e471e1ef4aacb4c9005ba9 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 10:52:08 +0100 Subject: [PATCH 38/66] fix: transformers import and version in init --- src/diffusers/pipelines/audioldm/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/audioldm/__init__.py b/src/diffusers/pipelines/audioldm/__init__.py index 719bbbce847c..80b11a259ef4 100644 --- a/src/diffusers/pipelines/audioldm/__init__.py +++ b/src/diffusers/pipelines/audioldm/__init__.py @@ -1 +1,16 @@ -from .pipeline_audioldm import AudioLDMPipeline +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AudioLDMPipeline, + ) +else: + from .pipeline_audioldm import AudioLDMPipeline From 5d6d1f8b324f5583e7805dc01e2c86e493660d66 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 11:09:19 +0100 Subject: [PATCH 39/66] clean-up: make style --- .../convert_original_audioldm_to_diffusers.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 0adf0cf51280..389771960bc7 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -23,8 +23,13 @@ parser = argparse.ArgumentParser() parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." 
+ "--checkpoint_path", + default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", + type=str, + required=False, # TODO: revert to True + help="Path to the checkpoint to convert.", ) + # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml parser.add_argument( "--original_config_file", default=None, @@ -43,6 +48,12 @@ type=str, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", ) + parser.add_argument( + "--pipeline_type", + default=None, + type=str, + help="The pipeline type. If `None` pipeline will be automatically inferred.", + ) parser.add_argument( "--image_size", default=None, @@ -57,7 +68,7 @@ ) parser.add_argument( "--extract_ema", - action="store_true", + action="store_false", # TODO: revert to store_true help=( "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" @@ -74,7 +85,13 @@ action="store_true", help="Whether to store pipeline in safetensors format or not.", ) - parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + parser.add_argument( + "--dump_path", + default="/Users/sanchitgandhi/convert-audioldm/diffusers_out_3", + type=str, + required=False, # TODO: revert to True + help="Path to the output model.", + ) parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") args = parser.parse_args() From 8b4ea0751472314c7caacc803ca63258949ae0db Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 11:09:39 +0100 Subject: [PATCH 40/66] Revert "clean-up: make style" This reverts commit 5d6d1f8b324f5583e7805dc01e2c86e493660d66. --- .../convert_original_audioldm_to_diffusers.py | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 389771960bc7..0adf0cf51280 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -23,13 +23,8 @@ parser = argparse.ArgumentParser() parser.add_argument( - "--checkpoint_path", - default="/Users/sanchitgandhi/convert-audioldm/ldm_trimmed.ckpt", - type=str, - required=False, # TODO: revert to True - help="Path to the checkpoint to convert.", + "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert." ) - # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml parser.add_argument( "--original_config_file", default=None, @@ -48,12 +43,6 @@ type=str, help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']", ) - parser.add_argument( - "--pipeline_type", - default=None, - type=str, - help="The pipeline type. If `None` pipeline will be automatically inferred.", - ) parser.add_argument( "--image_size", default=None, @@ -68,7 +57,7 @@ ) parser.add_argument( "--extract_ema", - action="store_false", # TODO: revert to store_true + action="store_true", help=( "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. 
EMA weights usually yield" @@ -85,13 +74,7 @@ action="store_true", help="Whether to store pipeline in safetensors format or not.", ) - parser.add_argument( - "--dump_path", - default="/Users/sanchitgandhi/convert-audioldm/diffusers_out_3", - type=str, - required=False, # TODO: revert to True - help="Path to the output model.", - ) + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") args = parser.parse_args() From 876f241ca6956ba5705cc3d93289c982bc149774 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 11:10:25 +0100 Subject: [PATCH 41/66] clean-up: make style --- src/diffusers/pipelines/audioldm/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/audioldm/__init__.py b/src/diffusers/pipelines/audioldm/__init__.py index 80b11a259ef4..8ddef6c3f325 100644 --- a/src/diffusers/pipelines/audioldm/__init__.py +++ b/src/diffusers/pipelines/audioldm/__init__.py @@ -5,6 +5,7 @@ is_transformers_version, ) + try: if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): raise OptionalDependencyNotAvailable() From dfc1c859c9cf978f8b6169173b5b3a2f321c174d Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 13:59:25 +0100 Subject: [PATCH 42/66] clean-up: use pipeline tester mixin tests where poss --- tests/pipelines/audioldm/test_audioldm.py | 57 +---------------------- 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 387144e9062f..f7bd2acae216 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -522,63 +522,10 @@ def test_audioldm_width_opt(self): assert audio_shape == (1, 256) def test_attention_slicing_forward_pass(self): - # override this test since we want to compare 1-d audio waveforms (not 3d pixel arrays) - if not self.test_attention_slicing: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # Warmup pass when using mps (see #372) - if torch_device == "mps": - _ = pipe(**self.get_dummy_inputs(torch_device)) - - inputs = self.get_dummy_inputs(torch_device) - output_without_slicing = pipe(**inputs).audios - - pipe.enable_attention_slicing(slice_size=1) - inputs = self.get_dummy_inputs(torch_device) - output_with_slicing = pipe(**inputs).audios - - max_diff = np.abs(output_with_slicing - output_without_slicing).max() - self.assertLess(max_diff, 1e-3, "Attention slicing should not affect the inference results") + self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) def test_inference_batch_single_identical(self): - # override this test since we want to compare 1-d audio waveforms (not 3d pixel arrays) - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - # run single-sample inference - inputs["generator"] = self.get_generator(0) - output = audioldm_pipe(**inputs).audios - - batch_size = 3 - batched_inputs = {} - - # make unequal batch sizes - 
batched_inputs["prompt"] = [inputs["prompt"][: len(inputs["prompt"]) // i] for i in range(1, batch_size + 1)] - # make last batch super long - batched_inputs["prompt"][-1] = 2000 * "very long" - # set the generator - batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] - # duplicate any remaining inputs - for named_input in inputs: - if named_input not in batched_inputs: - batched_inputs[named_input] = inputs[named_input] - - # run batched inference - output_batch = audioldm_pipe(**batched_inputs).audios - assert output_batch.shape[0] == batch_size - - max_diff = np.abs(output_batch[0] - output[0]).max() - assert max_diff < 1e-4 + self._test_inference_batch_single_identical(test_mean_pixel_difference=False) @slow From ad80911426cc099c20877f6e8d89e2d6e9c2f72a Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 27 Feb 2023 13:59:39 +0100 Subject: [PATCH 43/66] clean-up: skip attn slicing test --- tests/pipelines/audioldm/test_audioldm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index f7bd2acae216..db143386e7a1 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -600,8 +600,8 @@ def test_audioldm_dpm(self): max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-3 + @unittest.skip("TODO(SG): fix or remove. This test yields the same memory for with / without attn slicing") def test_audioldm_attention_slicing(self): - # TODO(SG): fix or remove. This test yields the same memory for with / without attn slicing torch.cuda.reset_peak_memory_stats() pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) pipe = pipe.to(torch_device) From 68cc47e9b3697c9bac8632ba3a1581800cec87ec Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 09:39:02 +0100 Subject: [PATCH 44/66] fix: add torch dtype to docs --- docs/source/en/api/pipelines/audioldm.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index 385f622abc02..4a3adeede05a 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -63,13 +63,14 @@ method, or pass the `scheduler` argument to the `from_pretrained` method of the ```python >>> from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler +>>> import torch ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm") +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) >>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) >>> # or >>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm", subfolder="scheduler") ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", scheduler=dpm_scheduler) +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", scheduler=dpm_scheduler, torch_dtype=torch.float16) ``` ## AudioLDMPipeline From 6bc6a75fb54d4b0ecb35e021f0dfedc248ba0132 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 09:42:41 +0100 Subject: [PATCH 45/66] fix: remove conversion script out of src --- .../convert_original_audioldm_to_diffusers.py | 928 ++++++++++++++++- .../pipelines/audioldm/convert_from_ckpt.py | 942 ------------------ 2 files changed, 926 insertions(+), 944 deletions(-) delete mode 100644 
src/diffusers/pipelines/audioldm/convert_from_ckpt.py diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 0adf0cf51280..53dd4fc1eec0 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -12,11 +12,935 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Conversion script for the AudioLDM checkpoints. """ +""" Conversion script for the AudioLDM checkpoints.""" import argparse +import re -from diffusers.pipelines.audioldm.convert_from_ckpt import load_pipeline_from_original_audioldm_ckpt +import torch +from transformers import ( + AutoTokenizer, + ClapTextConfig, + ClapTextModelWithProjection, + SpeechT5HifiGan, + SpeechT5HifiGanConfig, +) + +from diffusers import ( + AudioLDMPipeline, + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) + +from ...utils import is_omegaconf_available, is_safetensors_available +from ...utils.import_utils import BACKENDS_MAPPING + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("nin_shortcut", "conv_shortcut") + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths +def renew_attention_paths(old_list): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') + + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') 
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("q.weight", "query.weight") + new_item = new_item.replace("q.bias", "query.bias") + + new_item = new_item.replace("k.weight", "key.weight") + new_item = new_item.replace("k.bias", "key.bias") + + new_item = new_item.replace("v.weight", "value.weight") + new_item = new_item.replace("v.bias", "value.bias") + + new_item = new_item.replace("proj_out.weight", "proj_attn.weight") + new_item = new_item.replace("proj_out.bias", "proj_attn.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits + attention layers, and takes into account additional replacements that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. 
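+    # For example, a fused attention tensor of shape (3 * C, ...) is reshaped per head and split into separate
+    # query / key / value projections of shape (C, ...) each, stored under the new names given in `path_map`.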
+ if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) + + for path in paths: + new_path = path["new"] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config, image_size: int): + """ + Creates a UNet config for diffusers based on the config of the original AudioLDM model. 
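+    For example (values are illustrative only): `model_channels=128` with `channel_mult=[1, 2, 3, 5]` maps to
+    `block_out_channels=(128, 256, 384, 640)`, and a resolution listed in `attention_resolutions` selects a
+    cross-attention down/up block in place of a plain `DownBlock2D`/`UpBlock2D`.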
+ """ + unet_params = original_config.model.params.unet_config.params + vae_params = original_config.model.params.first_stage_config.params.ddconfig + + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + + cross_attention_dim = ( + unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + ) + + class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None + projection_class_embeddings_input_dim = ( + unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None + ) + class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + + config = dict( + sample_size=image_size // vae_scale_factor, + in_channels=unet_params.in_channels, + out_channels=unet_params.out_channels, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + layers_per_block=unet_params.num_res_blocks, + cross_attention_dim=cross_attention_dim, + class_embed_type=class_embed_type, + projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, + class_embeddings_concat=class_embeddings_concat, + ) + + return config + + +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config +def create_vae_diffusers_config(original_config, checkpoint, image_size: int): + """ + Creates a VAE config for diffusers based on the config of the original AudioLDM model. Compared to the original + Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. 
+ """ + vae_params = original_config.model.params.first_stage_config.params.ddconfig + _ = original_config.model.params.first_stage_config.params.embed_dim + + block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + + config = dict( + sample_size=image_size, + in_channels=vae_params.in_channels, + out_channels=vae_params.out_ch, + down_block_types=tuple(down_block_types), + up_block_types=tuple(up_block_types), + block_out_channels=tuple(block_out_channels), + latent_channels=vae_params.z_channels, + layers_per_block=vae_params.num_res_blocks, + scaling_factor=float(scaling_factor), + ) + return config + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config.model.params.timesteps, + beta_start=original_config.model.params.linear_start, + beta_end=original_config.model.params.linear_end, + beta_schedule="scaled_linear", + ) + return schedular + + +# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): + """ + Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion + conversion, this function additionally converts the learnt film embedding linear layer. + """ + + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + + unet_key = "model.diffusion_model." + # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + print(f"Checkpoint {path} has both EMA and non-EMA weights.") + print( + "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." + ) + for key in keys: + if key.startswith("model.diffusion_model"): + flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + else: + if sum(k.startswith("model_ema") for k in keys) > 100: + print( + "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
+ ) + + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"] + new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + 
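+ # the remaining middle_block entries are renamed below: middle_block.2 (the second resnet) becomes
+ # mid_block.resnets.1 and middle_block.1 (the attention) becomes mid_block.attentions.0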
resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + vae_key = "first_stage_model." 
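+ # strip the `first_stage_model.` prefix so the remaining keys line up with the diffusers `AutoencoderKL` naming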
+ keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +CLAP_KEYS_TO_MODIFY_MAPPING = { + "text_branch": "text_model", + "attn": "attention.self", + "self.proj": "output.dense", + "attention.self_mask": "attn_mask", + "mlp.fc1": "intermediate.dense", + "mlp.fc2": "output.dense", + "norm1": "layernorm_before", + "norm2": "layernorm_after", + "bn0": "batch_norm", +} + +CLAP_KEYS_TO_IGNORE = ["text_transform"] + +CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"] + + +def convert_open_clap_checkpoint(checkpoint): + """ + Takes a state dict and returns a converted CLAP checkpoint. + """ + # extract state dict for CLAP text embedding model, discarding the audio component + model_state_dict = {} + model_key = "cond_stage_model.model.text_" + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(model_key): + model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key) + + new_checkpoint = {} + + sequential_layers_pattern = r".*sequential.(\d+).*" + text_projection_pattern = r".*_projection.(\d+).*" + + for key, value in model_state_dict.items(): + # check if key should be ignored in mapping + if key.split(".")[0] in CLAP_KEYS_TO_IGNORE: + continue + + # check if any key needs to be modified + for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + if re.match(sequential_layers_pattern, key): + # replace sequential layers with list + sequential_layer = re.match(sequential_layers_pattern, key).group(1) + + key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") + elif re.match(text_projection_pattern, key): + projecton_layer = int(re.match(text_projection_pattern, key).group(1)) + + # Because in CLAP they use `nn.Sequential`... 
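+ # the original CLAP text projection is an nn.Sequential(Linear, ReLU, Linear), so Sequential index 0
+ # maps to `linear1` and index 2 maps to `linear2` of the transformers projection layer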
+ transformers_projection_layer = 1 if projecton_layer == 0 else 2 + + key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") + + if "audio" and "qkv" in key: + # split qkv into query key and value + mixed_qkv = value + qkv_dim = mixed_qkv.size(0) // 3 + + query_layer = mixed_qkv[:qkv_dim] + key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] + value_layer = mixed_qkv[qkv_dim * 2 :] + + new_checkpoint[key.replace("qkv", "query")] = query_layer + new_checkpoint[key.replace("qkv", "key")] = key_layer + new_checkpoint[key.replace("qkv", "value")] = value_layer + else: + new_checkpoint[key] = value + + return new_checkpoint + + +def create_transformers_vocoder_config(original_config): + """ + Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. + """ + vocoder_params = original_config.model.params.vocoder_config.params + + config = dict( + model_in_dim=vocoder_params.num_mels, + sampling_rate=vocoder_params.sampling_rate, + upsample_initial_channel=vocoder_params.upsample_initial_channel, + upsample_rates=list(vocoder_params.upsample_rates), + upsample_kernel_sizes=list(vocoder_params.upsample_kernel_sizes), + resblock_kernel_sizes=list(vocoder_params.resblock_kernel_sizes), + resblock_dilation_sizes=[ + list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + ], + normalize_before=False, + ) + + return config + + +def convert_hifigan_checkpoint(checkpoint, config): + """ + Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint. + """ + # extract state dict for vocoder + vocoder_state_dict = {} + vocoder_key = "first_stage_model.vocoder." + keys = list(checkpoint.keys()) + for key in keys: + if key.startswith(vocoder_key): + vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key) + + # fix upsampler keys, everything else is correct already + for i in range(len(config.upsample_rates)): + vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight") + vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias") + + if not config.normalize_before: + # if we don't set normalize_before then these variables are unused, so we set them to their initialised values + vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim) + vocoder_state_dict["scale"] = torch.ones(config.model_in_dim) + + return vocoder_state_dict + + +# Adapted from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 +DEFAULT_CONFIG = { + "model": { + "params": { + "linear_start": 0.0015, + "linear_end": 0.0195, + "timesteps": 1000, + "channels": 8, + "scale_by_std": True, + "unet_config": { + "target": "audioldm.latent_diffusion.openaimodel.UNetModel", + "params": { + "extra_film_condition_dim": 512, + "extra_film_use_concat": True, + "in_channels": 8, + "out_channels": 8, + "model_channels": 128, + "attention_resolutions": [8, 4, 2], + "num_res_blocks": 2, + "channel_mult": [1, 2, 3, 5], + "num_head_channels": 32, + }, + }, + "first_stage_config": { + "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", + "params": { + "embed_dim": 8, + "ddconfig": { + "z_channels": 8, + "resolution": 256, + "in_channels": 1, + "out_ch": 1, + "ch": 128, + "ch_mult": [1, 2, 4], + "num_res_blocks": 2, + }, + }, + }, + "vocoder_config": { + "target": "audioldm.first_stage_model.vocoder", + "params": { + "upsample_rates": [5, 4, 2, 
2, 2],
+ "upsample_kernel_sizes": [16, 16, 8, 4, 4],
+ "upsample_initial_channel": 1024,
+ "resblock_kernel_sizes": [3, 7, 11],
+ "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+ "num_mels": 64,
+ "sampling_rate": 16000,
+ },
+ },
+ },
+ },
+}
+
+
+def load_pipeline_from_original_audioldm_ckpt(
+ checkpoint_path: str,
+ original_config_file: str = None,
+ image_size: int = 512,
+ prediction_type: str = None,
+ extract_ema: bool = False,
+ scheduler_type: str = "ddim",
+ num_in_channels: int = None,
+ device: str = None,
+ from_safetensors: bool = False,
+) -> AudioLDMPipeline:
+ """
+ Load an AudioLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file.
+
+ Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the
+ global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is
+ recommended that you override the default values and/or supply an `original_config_file` wherever possible.
+
+ :param checkpoint_path: Path to the `.ckpt` file.
+ :param original_config_file: Path to the `.yaml` config file corresponding to the original architecture. If
+ `None`, the config is instantiated from the default AudioLDM values.
+ :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints.
+ :param prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original AudioLDM
+ checkpoints.
+ :param num_in_channels: The number of input channels. If `None`, the number of input channels is automatically
+ inferred.
+ :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler",
+ "euler-ancestral", "dpm", "ddim"]`.
+ :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the
+ EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield
+ higher quality audio for inference. Non-EMA weights are usually better suited to continued fine-tuning.
+ :param device: The device to use. Pass `None` to determine automatically.
+ :param from_safetensors: If `checkpoint_path` is in `safetensors` format, load the checkpoint with safetensors
+ instead of PyTorch.
+ :return: An `AudioLDMPipeline` object representing the passed-in `.ckpt`/`.safetensors` file.
+ """ + + if not is_omegaconf_available(): + raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) + + from omegaconf import OmegaConf + + if from_safetensors: + if not is_safetensors_available(): + raise ValueError(BACKENDS_MAPPING["safetensors"][1]) + + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(checkpoint_path, map_location=device) + else: + checkpoint = torch.load(checkpoint_path, map_location=device) + + if "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + if original_config_file is None: + original_config = DEFAULT_CONFIG + original_config = OmegaConf.create(original_config) + else: + original_config = OmegaConf.load(original_config_file) + + if num_in_channels is not None: + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + if prediction_type is None: + prediction_type = "v_prediction" + else: + if prediction_type is None: + prediction_type = "epsilon" + + if image_size is None: + image_size = 512 + + num_train_timesteps = original_config.model.params.timesteps + beta_start = original_config.model.params.linear_start + beta_end = original_config.model.params.linear_end + + scheduler = DDIMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + prediction_type=prediction_type, + ) + # make sure scheduler works correctly with DDIM + scheduler.register_to_config(clip_sample=False) + + if scheduler_type == "pndm": + config = dict(scheduler.config) + config["skip_prk_steps"] = True + scheduler = PNDMScheduler.from_config(config) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "dpm": + scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) + elif scheduler_type == "ddim": + scheduler = scheduler + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + # Convert the UNet2DModel + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + unet = UNet2DConditionModel(**unet_config) + + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the VAE model + vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + + # Convert the text model + # AudioLDM uses the same configuration and tokenizer as the original CLAP model + config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused") + 
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + converted_text_model = convert_open_clap_checkpoint(checkpoint) + text_model = ClapTextModelWithProjection(config) + + missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False) + # we expect not to have token_type_ids in our original state dict so let's ignore them + missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS)) + + if len(unexpected_keys) > 0: + raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}") + + if len(missing_keys) > 0: + raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}") + + # Convert the vocoder model + vocoder_config = create_transformers_vocoder_config(original_config) + vocoder_config = SpeechT5HifiGanConfig(**vocoder_config) + converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config) + + vocoder = SpeechT5HifiGan(vocoder_config) + vocoder.load_state_dict(converted_vocoder_checkpoint) + + # Instantiate the diffusers pipeline + pipe = AudioLDMPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) + + return pipe if __name__ == "__main__": diff --git a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py b/src/diffusers/pipelines/audioldm/convert_from_ckpt.py deleted file mode 100644 index 4f0df0e418d6..000000000000 --- a/src/diffusers/pipelines/audioldm/convert_from_ckpt.py +++ /dev/null @@ -1,942 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the AudioLDM checkpoints.""" - -import re - -import torch -from transformers import ( - AutoTokenizer, - ClapTextConfig, - ClapTextModelWithProjection, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) - -from diffusers import ( - AudioLDMPipeline, - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - HeunDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) - -from ...utils import is_omegaconf_available, is_safetensors_available -from ...utils.import_utils import BACKENDS_MAPPING - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments -def shave_segments(path, n_shave_prefix_segments=1): - """ - Removes segments. Positive values shave the first segments, negative shave the last segments. 
- """ - if n_shave_prefix_segments >= 0: - return ".".join(path.split(".")[n_shave_prefix_segments:]) - else: - return ".".join(path.split(".")[:n_shave_prefix_segments]) - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_resnet_paths -def renew_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item.replace("in_layers.0", "norm1") - new_item = new_item.replace("in_layers.2", "conv1") - - new_item = new_item.replace("out_layers.0", "norm2") - new_item = new_item.replace("out_layers.3", "conv2") - - new_item = new_item.replace("emb_layers.1", "time_emb_proj") - new_item = new_item.replace("skip_connection", "conv_shortcut") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths -def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside resnets to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_attention_paths -def renew_attention_paths(old_list): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - # new_item = new_item.replace('norm.weight', 'group_norm.weight') - # new_item = new_item.replace('norm.bias', 'group_norm.bias') - - # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') - # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') - - # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths -def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): - """ - Updates paths inside attentions to the new naming scheme (local renaming) - """ - mapping = [] - for old_item in old_list: - new_item = old_item - - new_item = new_item.replace("norm.weight", "group_norm.weight") - new_item = new_item.replace("norm.bias", "group_norm.bias") - - new_item = new_item.replace("q.weight", "query.weight") - new_item = new_item.replace("q.bias", "query.bias") - - new_item = new_item.replace("k.weight", "key.weight") - new_item = new_item.replace("k.bias", "key.bias") - - new_item = new_item.replace("v.weight", "value.weight") - new_item = new_item.replace("v.bias", "value.bias") - - new_item = new_item.replace("proj_out.weight", "proj_attn.weight") - new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - - new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) - - mapping.append({"old": old_item, "new": new_item}) - - return mapping - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint -def assign_to_checkpoint( - paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None 
-): - """ - This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits - attention layers, and takes into account additional replacements that may arise. - - Assigns the weights to the new checkpoint. - """ - assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." - - # Splits the attention layers into three variables. - if attention_paths_to_split is not None: - for path, path_map in attention_paths_to_split.items(): - old_tensor = old_checkpoint[path] - channels = old_tensor.shape[0] // 3 - - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) - - num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - - old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) - query, key, value = old_tensor.split(channels // num_heads, dim=1) - - checkpoint[path_map["query"]] = query.reshape(target_shape) - checkpoint[path_map["key"]] = key.reshape(target_shape) - checkpoint[path_map["value"]] = value.reshape(target_shape) - - for path in paths: - new_path = path["new"] - - # These have already been assigned - if attention_paths_to_split is not None and new_path in attention_paths_to_split: - continue - - # Global renaming happens here - new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") - new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") - new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") - - if additional_replacements is not None: - for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], replacement["new"]) - - # proj_attn.weight has to be converted from conv 1D to linear - if "proj_attn.weight" in new_path: - checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] - else: - checkpoint[new_path] = old_checkpoint[path["old"]] - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear -def conv_attn_to_linear(checkpoint): - keys = list(checkpoint.keys()) - attn_keys = ["query.weight", "key.weight", "value.weight"] - for key in keys: - if ".".join(key.split(".")[-2:]) in attn_keys: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0, 0] - elif "proj_attn.weight" in key: - if checkpoint[key].ndim > 2: - checkpoint[key] = checkpoint[key][:, :, 0] - - -def create_unet_diffusers_config(original_config, image_size: int): - """ - Creates a UNet config for diffusers based on the config of the original AudioLDM model. 
- """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig - - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] - - down_block_types = [] - resolution = 1 - for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" - down_block_types.append(block_type) - if i != len(block_out_channels) - 1: - resolution *= 2 - - up_block_types = [] - for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" - up_block_types.append(block_type) - resolution //= 2 - - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) - - cross_attention_dim = ( - unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels - ) - - class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None - projection_class_embeddings_input_dim = ( - unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None - ) - class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None - - config = dict( - sample_size=image_size // vae_scale_factor, - in_channels=unet_params.in_channels, - out_channels=unet_params.out_channels, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - layers_per_block=unet_params.num_res_blocks, - cross_attention_dim=cross_attention_dim, - class_embed_type=class_embed_type, - projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, - class_embeddings_concat=class_embeddings_concat, - ) - - return config - - -# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_vae_diffusers_config -def create_vae_diffusers_config(original_config, checkpoint, image_size: int): - """ - Creates a VAE config for diffusers based on the config of the original AudioLDM model. Compared to the original - Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. 
- """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim - - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] - down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) - up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 - - config = dict( - sample_size=image_size, - in_channels=vae_params.in_channels, - out_channels=vae_params.out_ch, - down_block_types=tuple(down_block_types), - up_block_types=tuple(up_block_types), - block_out_channels=tuple(block_out_channels), - latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, - scaling_factor=float(scaling_factor), - ) - return config - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular -def create_diffusers_schedular(original_config): - schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", - ) - return schedular - - -# Adapted from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_unet_checkpoint -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): - """ - Takes a state dict and a config, and returns a converted checkpoint. Compared to the original Stable Diffusion - conversion, this function additionally converts the learnt film embedding linear layer. - """ - - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - unet_key = "model.diffusion_model." - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - if sum(k.startswith("model_ema") for k in keys) > 100: - print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
- ) - - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) - - new_checkpoint = {} - - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] - - new_checkpoint["class_embedding.weight"] = unet_state_dict["film_emb.weight"] - new_checkpoint["class_embedding.bias"] = unet_state_dict["film_emb.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] - new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] - - new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] - new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] - new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] - new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] - - # Retrieves the keys for the input blocks only - num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) - input_blocks = { - layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] - for layer_id in range(num_input_blocks) - } - - # Retrieves the keys for the middle blocks only - num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) - middle_blocks = { - layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] - for layer_id in range(num_middle_blocks) - } - - # Retrieves the keys for the output blocks only - num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) - output_blocks = { - layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] - for layer_id in range(num_output_blocks) - } - - for i in range(1, num_input_blocks): - block_id = (i - 1) // (config["layers_per_block"] + 1) - layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) - - resnets = [ - key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key - ] - attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] - - if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight" - ) - new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias" - ) - - paths = renew_resnet_paths(resnets) - meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - resnet_0 = middle_blocks[0] - attentions = middle_blocks[1] - resnet_1 = middle_blocks[2] - - resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) - - 
resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) - - attentions_paths = renew_attention_paths(attentions) - meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} - assign_to_checkpoint( - attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - for i in range(num_output_blocks): - block_id = i // (config["layers_per_block"] + 1) - layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] - output_block_list = {} - - for layer in output_block_layers: - layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) - if layer_id in output_block_list: - output_block_list[layer_id].append(layer_name) - else: - output_block_list[layer_id] = [layer_name] - - if len(output_block_list) > 1: - resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] - attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] - - resnet_0_paths = renew_resnet_paths(resnets) - paths = renew_resnet_paths(resnets) - - meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - - output_block_list = {k: sorted(v) for k, v in output_block_list.items()} - if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight" - ] - new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias" - ] - - # Clear attentions as they have been attributed above. - if len(attentions) == 2: - attentions = [] - - if len(attentions): - paths = renew_attention_paths(attentions) - meta_path = { - "old": f"output_blocks.{i}.1", - "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", - } - assign_to_checkpoint( - paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config - ) - else: - resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) - for path in resnet_0_paths: - old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) - - new_checkpoint[new_path] = unet_state_dict[old_path] - - return new_checkpoint - - -# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.convert_ldm_vae_checkpoint -def convert_ldm_vae_checkpoint(checkpoint, config): - # extract state dict for VAE - vae_state_dict = {} - vae_key = "first_stage_model." 
- keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vae_key): - vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) - - new_checkpoint = {} - - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] - - new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] - new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] - - # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) - down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) - } - - # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) - up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) - } - - for i in range(num_down_blocks): - resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] - - if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight" - ) - new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias" - ) - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - - for i in range(num_up_blocks): - block_id = num_up_blocks - 1 - i - resnets = [ - key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key - ] - - if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight" - ] - new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias" - ] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] - num_mid_res_blocks = 2 - for i in range(1, num_mid_res_blocks + 1): - resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] - - paths = renew_vae_resnet_paths(resnets) - meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - - mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] - paths = renew_vae_attention_paths(mid_attentions) - meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} - assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) - conv_attn_to_linear(new_checkpoint) - return new_checkpoint - - -CLAP_KEYS_TO_MODIFY_MAPPING = { - "text_branch": "text_model", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -CLAP_KEYS_TO_IGNORE = ["text_transform"] - -CLAP_EXPECTED_MISSING_KEYS = ["text_model.embeddings.token_type_ids"] - - -def convert_open_clap_checkpoint(checkpoint): - """ - Takes a state dict and returns a converted CLAP checkpoint. - """ - # extract state dict for CLAP text embedding model, discarding the audio component - model_state_dict = {} - model_key = "cond_stage_model.model.text_" - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(model_key): - model_state_dict[key.replace(model_key, "text_")] = checkpoint.get(key) - - new_checkpoint = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in model_state_dict.items(): - # check if key should be ignored in mapping - if key.split(".")[0] in CLAP_KEYS_TO_IGNORE: - continue - - # check if any key needs to be modified - for key_to_modify, new_key in CLAP_KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... 
- transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - new_checkpoint[key.replace("qkv", "query")] = query_layer - new_checkpoint[key.replace("qkv", "key")] = key_layer - new_checkpoint[key.replace("qkv", "value")] = value_layer - else: - new_checkpoint[key] = value - - return new_checkpoint - - -def create_transformers_vocoder_config(original_config): - """ - Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. - """ - vocoder_params = original_config.model.params.vocoder_config.params - - config = dict( - model_in_dim=vocoder_params.num_mels, - sampling_rate=vocoder_params.sampling_rate, - upsample_initial_channel=vocoder_params.upsample_initial_channel, - upsample_rates=list(vocoder_params.upsample_rates), - upsample_kernel_sizes=list(vocoder_params.upsample_kernel_sizes), - resblock_kernel_sizes=list(vocoder_params.resblock_kernel_sizes), - resblock_dilation_sizes=[ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes - ], - normalize_before=False, - ) - - return config - - -def convert_hifigan_checkpoint(checkpoint, config): - """ - Takes a state dict and config, and returns a converted HiFiGAN vocoder checkpoint. - """ - # extract state dict for vocoder - vocoder_state_dict = {} - vocoder_key = "first_stage_model.vocoder." - keys = list(checkpoint.keys()) - for key in keys: - if key.startswith(vocoder_key): - vocoder_state_dict[key.replace(vocoder_key, "")] = checkpoint.get(key) - - # fix upsampler keys, everything else is correct already - for i in range(len(config.upsample_rates)): - vocoder_state_dict[f"upsampler.{i}.weight"] = vocoder_state_dict.pop(f"ups.{i}.weight") - vocoder_state_dict[f"upsampler.{i}.bias"] = vocoder_state_dict.pop(f"ups.{i}.bias") - - if not config.normalize_before: - # if we don't set normalize_before then these variables are unused, so we set them to their initialised values - vocoder_state_dict["mean"] = torch.zeros(config.model_in_dim) - vocoder_state_dict["scale"] = torch.ones(config.model_in_dim) - - return vocoder_state_dict - - -# Adapted from https://huggingface.co/spaces/haoheliu/audioldm-text-to-audio-generation/blob/84a0384742a22bd80c44e903e241f0623e874f1d/audioldm/utils.py#L72-L73 -DEFAULT_CONFIG = { - "model": { - "params": { - "linear_start": 0.0015, - "linear_end": 0.0195, - "timesteps": 1000, - "channels": 8, - "scale_by_std": True, - "unet_config": { - "target": "audioldm.latent_diffusion.openaimodel.UNetModel", - "params": { - "extra_film_condition_dim": 512, - "extra_film_use_concat": True, - "in_channels": 8, - "out_channels": 8, - "model_channels": 128, - "attention_resolutions": [8, 4, 2], - "num_res_blocks": 2, - "channel_mult": [1, 2, 3, 5], - "num_head_channels": 32, - }, - }, - "first_stage_config": { - "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", - "params": { - "embed_dim": 8, - "ddconfig": { - "z_channels": 8, - "resolution": 256, - "in_channels": 1, - "out_ch": 1, - "ch": 128, - "ch_mult": [1, 2, 4], - "num_res_blocks": 2, - }, - }, - }, - "vocoder_config": { - "target": "audioldm.first_stage_model.vocoder", - "params": { - "upsample_rates": [5, 4, 2, 
2, 2], - "upsample_kernel_sizes": [16, 16, 8, 4, 4], - "upsample_initial_channel": 1024, - "resblock_kernel_sizes": [3, 7, 11], - "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - "num_mels": 64, - "sampling_rate": 16000, - }, - }, - }, - }, -} - - -def load_pipeline_from_original_audioldm_ckpt( - checkpoint_path: str, - original_config_file: str = None, - image_size: int = 512, - prediction_type: str = None, - extract_ema: bool = False, - scheduler_type: str = "ddim", - num_in_channels: int = None, - device: str = None, - from_safetensors: bool = False, -) -> AudioLDMPipeline: - """ - Load an AudioLDM pipeline object from a `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file. - - Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the - global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is - recommended that you override the default values and/or supply an `original_config_file` wherever possible. - - :param checkpoint_path: Path to `.ckpt` file. :param original_config_file: Path to `.yaml` config file - corresponding to the original architecture. - If `None`, will be automatically instantiated based on default values. - :param image_size: The image size that the model was trained on. Use 512 for original AudioLDM checkpoints. :param - prediction_type: The prediction type that the model was trained on. Use `'epsilon'` for original - AudioLDM checkpoints. - :param num_in_channels: The number of input channels. If `None` number of input channels will be automatically - inferred. - :param scheduler_type: Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", - "euler-ancestral", "dpm", "ddim"]`. - :param extract_ema: Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract - the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually - yield higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning. - :param device: The device to use. Pass `None` to determine automatically. :param from_safetensors: If - `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors - instead of PyTorch. - :return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. 
- """ - - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - - if from_safetensors: - if not is_safetensors_available(): - raise ValueError(BACKENDS_MAPPING["safetensors"][1]) - - from safetensors import safe_open - - checkpoint = {} - with safe_open(checkpoint_path, framework="pt", device="cpu") as f: - for key in f.keys(): - checkpoint[key] = f.get_tensor(key) - else: - if device is None: - device = "cuda" if torch.cuda.is_available() else "cpu" - checkpoint = torch.load(checkpoint_path, map_location=device) - else: - checkpoint = torch.load(checkpoint_path, map_location=device) - - if "state_dict" in checkpoint: - checkpoint = checkpoint["state_dict"] - - if original_config_file is None: - original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) - else: - original_config = OmegaConf.load(original_config_file) - - if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - - if ( - "parameterization" in original_config["model"]["params"] - and original_config["model"]["params"]["parameterization"] == "v" - ): - if prediction_type is None: - prediction_type = "v_prediction" - else: - if prediction_type is None: - prediction_type = "epsilon" - - if image_size is None: - image_size = 512 - - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) - # make sure scheduler works correctly with DDIM - scheduler.register_to_config(clip_sample=False) - - if scheduler_type == "pndm": - config = dict(scheduler.config) - config["skip_prk_steps"] = True - scheduler = PNDMScheduler.from_config(config) - elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) - elif scheduler_type == "dpm": - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - elif scheduler_type == "ddim": - scheduler = scheduler - else: - raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - - # Convert the UNet2DModel - unet_config = create_unet_diffusers_config(original_config, image_size=image_size) - unet = UNet2DConditionModel(**unet_config) - - converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema - ) - - unet.load_state_dict(converted_unet_checkpoint) - - # Convert the VAE model - vae_config = create_vae_diffusers_config(original_config, checkpoint=checkpoint, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) - - # Convert the text model - # AudioLDM uses the same configuration and tokenizer as the original CLAP model - config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused") - 
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") - - converted_text_model = convert_open_clap_checkpoint(checkpoint) - text_model = ClapTextModelWithProjection(config) - - missing_keys, unexpected_keys = text_model.load_state_dict(converted_text_model, strict=False) - # we expect not to have token_type_ids in our original state dict so let's ignore them - missing_keys = list(set(missing_keys) - set(CLAP_EXPECTED_MISSING_KEYS)) - - if len(unexpected_keys) > 0: - raise ValueError(f"Unexpected keys when loading CLAP model: {unexpected_keys}") - - if len(missing_keys) > 0: - raise ValueError(f"Missing keys when loading CLAP model: {missing_keys}") - - # Convert the vocoder model - vocoder_config = create_transformers_vocoder_config(original_config) - vocoder_config = SpeechT5HifiGanConfig(**vocoder_config) - converted_vocoder_checkpoint = convert_hifigan_checkpoint(checkpoint, vocoder_config) - - vocoder = SpeechT5HifiGan(vocoder_config) - vocoder.load_state_dict(converted_vocoder_checkpoint) - - # Instantiate the diffusers pipeline - pipe = AudioLDMPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - vocoder=vocoder, - ) - - return pipe From 99a338898515e57b524c97f53ef4803721cf15ce Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 09:44:02 +0100 Subject: [PATCH 46/66] fix: remove .detach from 1d waveform --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index d001f175f7be..03e9018d5a49 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -292,7 +292,7 @@ def mel_spectrogram_to_waveform(self, mel_spectrogram): waveform = self.vocoder(mel_spectrogram) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - waveform = waveform.cpu().detach() + waveform = waveform.cpu() return waveform # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs From ed9be20214179927dbfd1ff55079044d47a667c7 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 09:46:22 +0100 Subject: [PATCH 47/66] fix: reduce default num inf steps --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 03e9018d5a49..6247723de60f 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -386,7 +386,7 @@ def __call__( prompt: Union[str, List[str]] = None, height: Optional[int] = None, width: Optional[int] = None, - num_inference_steps: int = 200, + num_inference_steps: int = 10, guidance_scale: float = 2.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_waveforms_per_prompt: Optional[int] = 1, @@ -412,7 +412,7 @@ def __call__( spectrogram and thus longer audio sample. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor // 8): The width in pixels of the generated spectrogram. - num_inference_steps (`int`, *optional*, defaults to 200): + num_inference_steps (`int`, *optional*, defaults to 10): The number of denoising steps. 
More denoising steps usually lead to a higher quality audio at the expense of slower inference. guidance_scale (`float`, *optional*, defaults to 2.5): From 87755de295c91e517056fba968a6e6e978f514fb Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 11:34:26 +0100 Subject: [PATCH 48/66] fix: swap height/width -> audio_length_in_s --- .../pipelines/audioldm/pipeline_audioldm.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 6247723de60f..2e598aace6cf 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -15,6 +15,7 @@ import inspect from typing import Callable, List, Optional, Union +import numpy as np import torch import torch.nn.functional as F from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan @@ -313,19 +314,22 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, - height, - width, + audio_length_in_s, + vocoder_upsample_factor, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor + if audio_length_in_s < min_audio_length_in_s: + raise ValueError( + f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but " + f"is {audio_length_in_s}." + ) if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) @@ -361,9 +365,9 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, self.vocoder.config.model_in_dim // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -384,8 +388,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - height: Optional[int] = None, - width: Optional[int] = None, + audio_length_in_s: Optional[float] = None, num_inference_steps: int = 10, guidance_scale: float = 2.5, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -407,11 +410,8 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the audio generation. 
If not defined, one has to pass `prompt_embeds`. instead. - height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated spectrogram. Using a larger height results in a longer - spectrogram and thus longer audio sample. - width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor // 8): - The width in pixels of the generated spectrogram. + audio_length_in_s (`int`, *optional*, defaults to 5.12): + The length of the generated audio sample in seconds. num_inference_steps (`int`, *optional*, defaults to 10): The number of denoising steps. More denoising steps usually lead to a higher quality audio at the expense of slower inference. @@ -465,13 +465,17 @@ def __call__( [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated audios. """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor // 8 + # 0. Convert audio input length from seconds to spectrogram height + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate + + if audio_length_in_s is None: + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor + + height = int(audio_length_in_s / vocoder_upsample_factor) # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, audio_length_in_s, vocoder_upsample_factor, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) # 2. Define call parameters @@ -509,14 +513,13 @@ def __call__( batch_size * num_waveforms_per_prompt, num_channels_latents, height, - width, prompt_embeds.dtype, device, generator, latents, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 6. Prepare extra step kwargs extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop From 42294e59d0d9f5a5551880c148108fba303c86f7 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 11:35:30 +0100 Subject: [PATCH 49/66] clean-up: make style --- .../pipelines/audioldm/pipeline_audioldm.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 2e598aace6cf..cbeac727a6d9 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -367,7 +367,12 @@ def check_inputs( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, self.vocoder.config.model_in_dim // self.vae_scale_factor) + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -475,7 +480,13 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, audio_length_in_s, vocoder_upsample_factor, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, ) # 2. Define call parameters From 21d644816c50feed92d122bc2d28638cf1c32761 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 11:40:14 +0100 Subject: [PATCH 50/66] fix: remove nightly tests --- tests/pipelines/audioldm/test_audioldm.py | 70 ----------------------- 1 file changed, 70 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index db143386e7a1..570e7a9bc097 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -744,73 +744,3 @@ def test_audioldm_pipeline_with_sequential_cpu_offloading(self): mem_bytes = torch.cuda.max_memory_allocated() # make sure that less than 2.8 GB is allocated assert mem_bytes < 2.8 * 10**9 - - -@nightly -@require_torch_gpu -class AudioLDMPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden table", - "latents": latents, - "generator": generator, - "num_inference_steps": 5, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm_ddim(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audios = audioldm_pipe(**inputs).audios[0] - - expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") - max_diff = np.abs(expected_audios 
- audios).max() - assert max_diff < 1e-3 - - def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audios = audioldm_pipe(**inputs).audios[0] - - expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") - max_diff = np.abs(expected_audios - audios).max() - assert max_diff < 1e-3 - - def test_audioldm_euler(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) - audioldm_pipe.scheduler = EulerDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audios = audioldm_pipe(**inputs).audios[0] - - expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") - max_diff = np.abs(expected_audios - audios).max() - assert max_diff < 1e-3 - - def test_audioldm_dpm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to(torch_device) - audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - audios = audioldm_pipe(**inputs).audios[0] - - expected_audios = load_numpy("https://huggingface.co/datasets/diffusers/test-arrays/resolve/main") - max_diff = np.abs(expected_audios - audios).max() - assert max_diff < 1e-3 From 01f9ade8b954ce42b6a599c462bdd9c52c6e5b96 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 11:52:05 +0100 Subject: [PATCH 51/66] fix: imports in conversion script --- scripts/convert_original_audioldm_to_diffusers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 53dd4fc1eec0..5e350bffde67 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -39,8 +39,8 @@ UNet2DConditionModel, ) -from ...utils import is_omegaconf_available, is_safetensors_available -from ...utils.import_utils import BACKENDS_MAPPING +from diffusers.utils import is_omegaconf_available, is_safetensors_available +from diffusers.utils.import_utils import BACKENDS_MAPPING # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments From a9faabbf9b8f4bce86c7fdb28085667964022df4 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 12:05:26 +0100 Subject: [PATCH 52/66] clean-up: slim-down to two slow tests --- tests/pipelines/audioldm/test_audioldm.py | 174 +--------------------- 1 file changed, 6 insertions(+), 168 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 570e7a9bc097..73a4857a1a5e 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -549,22 +549,22 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 } return inputs - def test_audioldm_ddim(self): + def test_audioldm(self): audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = DDIMScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe = 
audioldm_pipe.to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 25 audio = audioldm_pipe(**inputs).audios[0] assert audio.ndim == 1 assert len(audio) == 81952 - audio_slice = audio[3880:3890] - expected_slice = np.array([-0.0574, 0.2462, 0.3955, 0.4213, 0.3901, 0.3770, 0.2762, 0.0206, -0.2208, -0.3282]) + audio_slice = audio[77230:77240] + expected_slice = np.array([-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]) max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-3 + assert max_diff < 1e-2 def test_audioldm_lms(self): audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") @@ -581,166 +581,4 @@ def test_audioldm_lms(self): audio_slice = audio[27780:27790] expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-3 - - def test_audioldm_dpm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = DPMSolverMultistepScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81952 - - audio_slice = audio[69310:69320] - expected_slice = np.array([0.1842, 0.2411, 0.3127, 0.3069, 0.2287, 0.0948, -0.0071, -0.041, -0.1293, -0.2075]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-3 - - @unittest.skip("TODO(SG): fix or remove. This test yields the same memory for with / without attn slicing") - def test_audioldm_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # enable attention slicing - pipe.enable_attention_slicing(slice_size="max") - inputs = self.get_inputs(torch_device, dtype=torch.float16) - audios_sliced = pipe(**inputs).audios - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.75 GB is allocated - assert mem_bytes < 3.75 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - audios = pipe(**inputs).audios - - # make sure that more than 3.75 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.75 * 10**9 - assert np.abs(audios_sliced - audios).max() < 1e-3 - - def test_audioldm_vae_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # enable vae slicing - pipe.enable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - audio_sliced = pipe(**inputs).audios - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 1.1 GB is allocated - assert mem_bytes < 1.1 * 10**9 - - # disable vae slicing - pipe.disable_vae_slicing() - inputs = self.get_inputs(torch_device, 
dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - audio = pipe(**inputs).audios - - # make sure that more than 1.1 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 1.1 * 10**9 - # There is a small discrepancy at the spectrogram borders vs. a fully batched version. - assert np.abs(audio_sliced - audio).max() < 1e-2 - - def test_audioldm_fp16_vs_autocast(self): - # this test makes sure that the original model with autocast - # and the new model with fp16 yield the same result - pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - audio_fp16 = pipe(**inputs).audios - - with torch.autocast(torch_device): - inputs = self.get_inputs(torch_device) - audio_autocast = pipe(**inputs).audios - - # Make sure results are close enough - diff = np.abs(audio_fp16.flatten() - audio_autocast.flatten()) - # They ARE different since ops are not run always at the same precision - # however, they should be extremely close. - assert diff.mean() < 1e-3 - - def test_audioldm_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 8, 128, 16) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.6730, -0.9062, 1.0400, 0.4220, -0.9785, 1.817, 0.1906, -1.3430, 1.3330]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 8, 128, 16) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.6763, -0.9062, 1.0520, 0.4200, -0.9750, 1.8220, 0.1879, -1.3490, 1.3190]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - callback_fn.has_been_called = False - - pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_audioldm_low_cpu_mem_usage(self): - pipeline_id = "cvssp/audioldm" - - start_time = time.time() - pipeline_low_cpu_mem_usage = AudioLDMPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = AudioLDMPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_audioldm_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = 
self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 + assert max_diff < 1e-2 From 9f26689eddc7fc9ae78db36699d404bdaad3393f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 12:31:15 +0100 Subject: [PATCH 53/66] clean-up: slim-down fast tests --- tests/pipelines/audioldm/test_audioldm.py | 206 +++------------------- 1 file changed, 25 insertions(+), 181 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 73a4857a1a5e..574eca54982e 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -194,7 +194,7 @@ def test_audioldm_prompt_embeds(self): output = audioldm_pipe(**inputs) audio_2 = output.audios[0] - assert np.abs(audio_1 - audio_2).max() < 1e-4 + assert np.abs(audio_1 - audio_2).max() < 1e-3 def test_audioldm_negative_prompt_embeds(self): components = self.get_dummy_components() @@ -241,107 +241,19 @@ def test_audioldm_negative_prompt_embeds(self): output = audioldm_pipe(**inputs) audio_2 = output.audios[0] - assert np.abs(audio_1 - audio_2).max() < 1e-4 + assert np.abs(audio_1 - audio_2).max() < 1e-3 - def test_audioldm_ddim_factor_8(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs, height=136) # width has to stay fixed for the vocoder - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 544 - - audio_slice = audio[-10:] - expected_slice = np.array([-0.0029, 0.0036, -0.0027, 0.0032, -0.0029, 0.0034, -0.0028, 0.0073, 0.0039, 0.0058]) - - assert np.abs(audio_slice - expected_slice).max() < 1e-3 - - def test_audioldm_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-3 - - def test_audioldm_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-3 - - def 
test_audioldm_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-3 - - def test_audioldm_k_euler(self): + def test_audioldm_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe.scheduler = EulerDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe = audioldm_pipe.to(device) audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(**inputs) + negative_prompt = "egg cracking" + output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) audio = output.audios[0] assert audio.ndim == 1 @@ -362,43 +274,20 @@ def test_audioldm_vae_slicing(self): audioldm_pipe = audioldm_pipe.to(device) audioldm_pipe.set_progress_bar_config(disable=None) - image_count = 4 + audio_count = 4 inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count + inputs["prompt"] = [inputs["prompt"]] * audio_count output_1 = audioldm_pipe(**inputs) # make sure sliced vae decode yields the same result audioldm_pipe.enable_vae_slicing() inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count + inputs["prompt"] = [inputs["prompt"]] * audio_count output_2 = audioldm_pipe(**inputs) # there is a small discrepancy at spectrogram borders vs. 
full batch decode - assert np.abs(output_2.audios - output_1.audios).max() < 1e-4 - - def test_audioldm_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "egg cracking" - output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-3 + assert np.abs(output_2.audios - output_1.audios).max() < 1e-3 def test_audioldm_num_waveforms_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -435,48 +324,29 @@ def test_audioldm_num_waveforms_per_prompt(self): assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - def test_audioldm_long_prompt(self): + def test_audioldm_audio_length_in_s(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) audioldm_pipe = AudioLDMPipeline(**components) audioldm_pipe = audioldm_pipe.to(torch_device) audioldm_pipe.set_progress_bar_config(disable=None) + vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.audioldm.pipeline_audioldm") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = audioldm_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = audioldm_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = audioldm_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) + inputs = self.get_dummy_inputs(device) + output = audioldm_pipe(audio_length_in_s=0.016, **inputs) + audio = output.audios[0] - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape + assert audio.ndim == 1 + assert len(audio) / vocoder_sampling_rate == 0.016 - assert text_embeddings.shape[1] == 32 + output = audioldm_pipe(audio_length_in_s=0.032, **inputs) + audio = output.audios[0] - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" + assert audio.ndim == 1 + assert len(audio) / vocoder_sampling_rate == 0.032 - def test_audioldm_height_opt(self): + def test_audioldm_vocoder_model_in_dim(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) audioldm_pipe = AudioLDMPipeline(**components) audioldm_pipe = audioldm_pipe.to(torch_device) 
audioldm_pipe.set_progress_bar_config(disable=None) @@ -487,36 +357,10 @@ def test_audioldm_height_opt(self): audio_shape = output.audios.shape assert audio_shape == (1, 256) - output = audioldm_pipe(prompt, num_inference_steps=1, height=96, width=8) - audio_shape = output.audios.shape - assert audio_shape == (1, 384) - - config = dict(audioldm_pipe.unet.config) - config["sample_size"] = 96 - audioldm_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1, width=8) # need to keep width fixed for vocoder - audio_shape = output.audios.shape - assert audio_shape == (1, 768) - - def test_audioldm_width_opt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - width = audioldm_pipe.vocoder.config.model_in_dim - - output = audioldm_pipe(prompt, num_inference_steps=1, width=width) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - config = audioldm_pipe.vocoder.config - config.model_in_dim = width * 2 + config.model_in_dim *= 2 audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1, width=width * 2) + output = audioldm_pipe(prompt, num_inference_steps=1) audio_shape = output.audios.shape # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram assert audio_shape == (1, 256) From 7bc812dd002fa501fc269db3e4ee4679824c5e25 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:00:56 +0100 Subject: [PATCH 54/66] fix: batch consistent tests --- .../pipelines/audioldm/pipeline_audioldm.py | 8 +++++++- tests/pipeline_params.py | 14 ++++++++++++++ tests/pipelines/audioldm/test_audioldm.py | 3 +++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index cbeac727a6d9..f474b00879d3 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -13,7 +13,7 @@ # limitations under the License. import inspect -from typing import Callable, List, Optional, Union +from typing import Callable, List, Optional, Union, Dict, Any import numpy as np import torch @@ -406,6 +406,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, output_type: Optional[str] = "np", ): r""" @@ -458,6 +459,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). output_type (`str`, *optional*, defaults to `"np"`): The output format of the generate image. Choose between: - `"np"`: Return Numpy `np.ndarray` objects. 
@@ -547,6 +552,7 @@ def __call__( t, encoder_hidden_states=None, class_labels=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, ).sample # perform guidance diff --git a/tests/pipeline_params.py b/tests/pipeline_params.py index 2703801d4a7d..bc1b3c89e7a6 100644 --- a/tests/pipeline_params.py +++ b/tests/pipeline_params.py @@ -102,3 +102,17 @@ UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"]) UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) + +TEXT_TO_AUDIO_PARAMS = frozenset( + [ + "prompt", + "audio_length_in_s", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + "cross_attention_kwargs", + ] +) + +TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 574eca54982e..d585a4055d0b 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -43,12 +43,15 @@ ) from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu +from ...pipeline_params import TEXT_TO_AUDIO_PARAMS, TEXT_TO_AUDIO_BATCH_PARAMS from ...test_pipelines_common import PipelineTesterMixin class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = AudioLDMPipeline + params = TEXT_TO_AUDIO_PARAMS + batch_params = TEXT_TO_AUDIO_BATCH_PARAMS def get_dummy_components(self): torch.manual_seed(0) From f0002f1ce425f76c31437e12c4fed9eade577e25 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:01:14 +0100 Subject: [PATCH 55/66] clean-up: make style --- scripts/convert_original_audioldm_to_diffusers.py | 1 - .../pipelines/audioldm/pipeline_audioldm.py | 2 +- tests/pipelines/audioldm/test_audioldm.py | 15 ++++++--------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 5e350bffde67..bd671e3a7b70 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -38,7 +38,6 @@ PNDMScheduler, UNet2DConditionModel, ) - from diffusers.utils import is_omegaconf_available, is_safetensors_available from diffusers.utils.import_utils import BACKENDS_MAPPING diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index f474b00879d3..240513419ef1 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -13,7 +13,7 @@ # limitations under the License. 
import inspect -from typing import Callable, List, Optional, Union, Dict, Any +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import torch diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index d585a4055d0b..434605257457 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -15,7 +15,6 @@ import gc -import time import unittest import numpy as np @@ -33,18 +32,14 @@ AudioLDMPipeline, AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, - logging, ) -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu -from ...pipeline_params import TEXT_TO_AUDIO_PARAMS, TEXT_TO_AUDIO_BATCH_PARAMS +from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu +from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ...test_pipelines_common import PipelineTesterMixin @@ -409,7 +404,9 @@ def test_audioldm(self): assert len(audio) == 81952 audio_slice = audio[77230:77240] - expected_slice = np.array([-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]) + expected_slice = np.array( + [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315] + ) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-2 From a0a156a43d08ba0ee2a5480a71284b96885c240b Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:02:06 +0100 Subject: [PATCH 56/66] clean-up: remove vae slicing fast test --- tests/pipelines/audioldm/test_audioldm.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 434605257457..323caf76c8e6 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -264,29 +264,6 @@ def test_audioldm_negative_prompt(self): assert np.abs(audio_slice - expected_slice).max() < 1e-3 - def test_audioldm_vae_slicing(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - audio_count = 4 - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * audio_count - output_1 = audioldm_pipe(**inputs) - - # make sure sliced vae decode yields the same result - audioldm_pipe.enable_vae_slicing() - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * audio_count - output_2 = audioldm_pipe(**inputs) - - # there is a small discrepancy at spectrogram borders vs. 
full batch decode - assert np.abs(output_2.audios - output_1.audios).max() < 1e-3 - def test_audioldm_num_waveforms_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From a01022ab3909b67935c54bc4d4da1682cdfb3037 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:13:06 +0100 Subject: [PATCH 57/66] clean-up: propagate changes to doc --- docs/source/en/api/pipelines/audioldm.mdx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index 4a3adeede05a..35347d54a53c 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -37,7 +37,7 @@ pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) pipe = pipe.to("cuda") prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" -audio = pipe(prompt, num_inference_steps=10, height=512).audios[0] +audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] # save the audio sample as a .wav file scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) @@ -45,10 +45,13 @@ scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) ### Tips -* Try to provide descriptive text inputs to AudioLDM. You can use adjectives to describe the sound (e.g. "high quality" or "clear") and make the prompt context specific (e.g., "water stream in a forest" instead of "stream"). +Prompts: +* Descriptive prompt inputs work best: you can use adjectives to describe the sound (e.g. "high quality" or "clear") and make the prompt context specific (e.g., "water stream in a forest" instead of "stream"). * It's best to use general terms like 'cat' or 'dog' instead of specific names or abstract objects that the model may not be familiar with. + +Inference: * The _quality_ of the predicted audio sample can be controlled by the `num_inference_steps` argument: higher steps give higher quality audio at the expense of slower inference. -* The _length_ of the predicted audio sample can be controlled by varying the `height` argument (which controls the height of the spectrogram prediction). +* The _length_ of the predicted audio sample can be controlled by varying the `audio_length_in_s` argument. 
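The `audio_length_in_s` behaviour described in the tips above is the seconds-to-spectrogram conversion introduced in PATCH 48 and refined in PATCH 62: the requested duration is divided by the vocoder upsample factor to get a spectrogram height, the height is rounded up to a multiple of the VAE scale factor, and the decoded waveform is trimmed back to the requested length. A small sketch of that arithmetic, with values assumed for the original AudioLDM checkpoint (16 kHz vocoder, 160 audio samples per mel frame, VAE scale factor 4 — consistent with the `(1, 8, 128, 16)` latents and 81920-sample outputs asserted in the tests):

```python
import numpy as np

# Values assumed for the original AudioLDM checkpoint (consistent with the slow tests:
# 81920-sample outputs, (1, 8, 128, 16) latents, 64 mel bins, 16 kHz vocoder).
sampling_rate = 16_000       # self.vocoder.config.sampling_rate
samples_per_mel_frame = 160  # np.prod(self.vocoder.config.upsample_rates)
vae_scale_factor = 4         # 2 ** (len(vae.config.block_out_channels) - 1)

# seconds of audio produced per spectrogram frame
vocoder_upsample_factor = samples_per_mel_frame / sampling_rate  # 0.01

audio_length_in_s = 5.12
height = int(audio_length_in_s / vocoder_upsample_factor)  # 512 spectrogram frames
# the pipeline rounds the height up so it divides evenly by the VAE scale factor
height = int(np.ceil(height / vae_scale_factor)) * vae_scale_factor  # still 512 here

latent_height = height // vae_scale_factor                          # 128, matches the test latents
original_waveform_length = int(audio_length_in_s * sampling_rate)   # 81920 samples after trimming

print(height, latent_height, original_waveform_length)  # 512 128 81920
```

With these checkpoint defaults, omitting `audio_length_in_s` gives the same 5.12 s output, since the default is computed as `unet.config.sample_size * vae_scale_factor * vocoder_upsample_factor`.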
### How to load and use different schedulers From 460231ed44d52336ace1b4b1e0b8036c165097ae Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:17:14 +0100 Subject: [PATCH 58/66] fix: increase test tol to 1e-2 --- tests/pipelines/audioldm/test_audioldm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 323caf76c8e6..1cdbacf702ff 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -151,7 +151,7 @@ def test_audioldm_ddim(self): [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] ) - assert np.abs(audio_slice - expected_slice).max() < 1e-3 + assert np.abs(audio_slice - expected_slice).max() < 1e-2 def test_audioldm_prompt_embeds(self): components = self.get_dummy_components() @@ -192,7 +192,7 @@ def test_audioldm_prompt_embeds(self): output = audioldm_pipe(**inputs) audio_2 = output.audios[0] - assert np.abs(audio_1 - audio_2).max() < 1e-3 + assert np.abs(audio_1 - audio_2).max() < 1e-2 def test_audioldm_negative_prompt_embeds(self): components = self.get_dummy_components() @@ -239,7 +239,7 @@ def test_audioldm_negative_prompt_embeds(self): output = audioldm_pipe(**inputs) audio_2 = output.audios[0] - assert np.abs(audio_1 - audio_2).max() < 1e-3 + assert np.abs(audio_1 - audio_2).max() < 1e-2 def test_audioldm_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -262,7 +262,7 @@ def test_audioldm_negative_prompt(self): [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] ) - assert np.abs(audio_slice - expected_slice).max() < 1e-3 + assert np.abs(audio_slice - expected_slice).max() < 1e-2 def test_audioldm_num_waveforms_per_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator From c8a7436716e1c7def3016bf5e5b6641d2758fa10 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:23:41 +0100 Subject: [PATCH 59/66] clean-up: finish docs --- docs/source/en/api/pipelines/audioldm.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index 35347d54a53c..f3987d2263ac 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -25,7 +25,7 @@ This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit ## Text-to-Audio -- *Text-to-Audio* [cvssp/audioldm](https://huggingface.co/cvssp/audioldm) with [`AudioLDMPipeline`] +The [`AudioLDMPipeline`] can be used to load pre-trained weights from [cvssp/audioldm](https://huggingface.co/cvssp/audioldm) and generate text-conditional audio outputs: ```python from diffusers import AudioLDMPipeline From ee672775338ff43913eabd769e26b7d75fe0f5b9 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Fri, 17 Mar 2023 15:24:55 +0100 Subject: [PATCH 60/66] clean-up: make style --- src/diffusers/models/unet_2d_condition.py | 2 +- .../pipelines/versatile_diffusion/modeling_text_unet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 4c21764e46c9..4577b35952f9 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -91,7 +91,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, 
UNet2DConditionLoadersMixin) attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): + class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. num_class_embeds (`int`, *optional*, defaults to None): diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 137260928cdb..b85c688e83fa 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -172,7 +172,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): + class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. num_class_embeds (`int`, *optional*, defaults to None): From d8ab1a197094f4fbbbbcf98148d69c7a94a15cb1 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 23 Mar 2023 11:22:53 +0100 Subject: [PATCH 61/66] feat: vocoder / VAE compatibility check --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index 240513419ef1..f57fde1da531 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -331,6 +331,13 @@ def check_inputs( f"is {audio_length_in_s}." ) + if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0: + raise ValueError( + f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the " + f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of " + f"{self.vae_scale_factor}." 
+ ) + if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): From 56e3fb9e6385331b5d50ccafe85f4fc7b22dc6ab Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 23 Mar 2023 11:35:57 +0100 Subject: [PATCH 62/66] feat: possibly expand / cut audio waveform --- src/diffusers/pipelines/audioldm/pipeline_audioldm.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py index f57fde1da531..2086cb0c8a8d 100644 --- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -490,6 +490,15 @@ def __call__( height = int(audio_length_in_s / vocoder_upsample_factor) + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) + if height % self.vae_scale_factor != 0: + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor + logger.info( + f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} " + f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the " + f"denoising process." + ) + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -581,6 +590,8 @@ def __call__( audio = self.mel_spectrogram_to_waveform(mel_spectrogram) + audio = audio[:, :original_waveform_length] + if output_type == "np": audio = audio.numpy() From e66dfc73bdd1d2037090957af9e9bab92922f5c2 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 23 Mar 2023 11:40:31 +0100 Subject: [PATCH 63/66] fix: pipeline call signature test --- tests/pipelines/audioldm/test_audioldm.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 1cdbacf702ff..3b1a9b7ea208 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -47,6 +47,18 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = AudioLDMPipeline params = TEXT_TO_AUDIO_PARAMS batch_params = TEXT_TO_AUDIO_BATCH_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "num_waveforms_per_prompt", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) def get_dummy_components(self): torch.manual_seed(0) From 7ed071a58569b99332587ec9b6eb116d06d81d1f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 23 Mar 2023 11:45:06 +0100 Subject: [PATCH 64/66] fix: slow tests output len --- tests/pipelines/audioldm/test_audioldm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 3b1a9b7ea208..294b90e29411 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -360,7 +360,7 @@ def test_inference_batch_single_identical(self): @slow -@require_torch_gpu +#@require_torch_gpu class AudioLDMPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() @@ -390,7 +390,7 @@ def test_audioldm(self): audio = audioldm_pipe(**inputs).audios[0] assert audio.ndim == 1 - assert len(audio) == 81952 + assert len(audio) == 81920 audio_slice = audio[77230:77240] expected_slice = np.array( @@ -409,7 +409,7 @@ def test_audioldm_lms(self): audio = audioldm_pipe(**inputs).audios[0] assert 
audio.ndim == 1 - assert len(audio) == 81952 + assert len(audio) == 81920 audio_slice = audio[27780:27790] expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) From b90d564d7b6640754f95ab4ae4a836a0f95fbc61 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Thu, 23 Mar 2023 12:39:18 +0100 Subject: [PATCH 65/66] clean-up: make style --- tests/pipelines/audioldm/test_audioldm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 294b90e29411..10de5440eb00 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -37,7 +37,6 @@ UNet2DConditionModel, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ...test_pipelines_common import PipelineTesterMixin @@ -360,7 +359,7 @@ def test_inference_batch_single_identical(self): @slow -#@require_torch_gpu +# @require_torch_gpu class AudioLDMPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() From ef6c8e061e51f8d57fef57c1e056fc3640ab1456 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 23 Mar 2023 13:43:03 +0000 Subject: [PATCH 66/66] make style --- tests/pipeline_params.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline_params.py b/tests/pipeline_params.py index 653e2ef3310a..a0ac6c641c0b 100644 --- a/tests/pipeline_params.py +++ b/tests/pipeline_params.py @@ -118,4 +118,4 @@ TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) -TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"]) \ No newline at end of file +TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"])
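Taken together, the patches above settle the public call signature of `AudioLDMPipeline`: `audio_length_in_s` replaces `height`/`width`, the default `num_inference_steps` drops to 10, `cross_attention_kwargs` is forwarded to the UNet, and waveforms are trimmed to the requested length. A minimal usage sketch of the resulting API against the converted `cvssp/audioldm` weights used in the slow tests (the prompts, seed, and output filenames are illustrative, and a CUDA device is assumed):

```python
import torch
from scipy.io import wavfile

from diffusers import AudioLDMPipeline

# assumes the converted checkpoint referenced in the slow tests and a CUDA device
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to("cuda")

generator = torch.Generator("cuda").manual_seed(0)
audios = pipe(
    prompt="A hammer hitting a wooden table",
    negative_prompt="low quality, average quality",  # illustrative negative prompt
    audio_length_in_s=5.12,        # trimmed to int(5.12 * 16000) = 81920 samples
    num_inference_steps=10,        # the new default introduced in PATCH 47
    guidance_scale=2.5,
    num_waveforms_per_prompt=2,    # audios.shape == (2, 81920)
    generator=generator,
).audios

for i, audio in enumerate(audios):
    wavfile.write(f"hammer_{i}.wav", rate=16000, data=audio)
```

Requesting a duration that is not a multiple of the spectrogram frame length is also fine: the pipeline rounds the latent height up and cuts the decoded waveform back to `int(audio_length_in_s * sampling_rate)` samples, as added in PATCH 62.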