From 7c0c9ac7e25f51ea71a1bbc8c77ccbf6b4c7f2ba Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:12:26 +0100
Subject: [PATCH 01/17] fix docs

---
 docs/source/en/api/pipelines/audioldm2.md     | 52 +++++--------
 .../pipelines/audioldm2/pipeline_audioldm2.py | 44 +++++++++++-----
 2 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md
index f32ed6acdd80..9d055d4019b2 100644
--- a/docs/source/en/api/pipelines/audioldm2.md
+++ b/docs/source/en/api/pipelines/audioldm2.md
@@ -20,10 +20,10 @@ Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelin
 is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from text embeddings.
 
 Two text encoder models are used to compute the text embeddings from a prompt input: the text-branch of
 [CLAP](https://huggingface.co/docs/transformers/main/en/model_doc/clap) and the encoder of [Flan-T5](https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5). These text embeddings
-are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2ProjectionModel).
+are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel).
 A [GPT2](https://huggingface.co/docs/transformers/main/en/model_doc/gpt2) _language model (LM)_ is used to auto-regressively
 predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding
-vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2UNet2DConditionModel)
+vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel)
 of AudioLDM 2 is unique in the sense that it takes **two** cross-attention embeddings, as opposed to one cross-attention
 conditioning, as in most other LDMs.
 
@@ -38,13 +38,17 @@ found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
 
 ### Choosing a checkpoint
 
-AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio generation. The third checkpoint is trained exclusively on text-to-music generation. See table below for details on the three official checkpoints:
+AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio
+generation. The third checkpoint is trained exclusively on text-to-music generation.
 
-| Checkpoint | Task | Model Size | Training Data / h |
-|-----------------------------------------------------------------|---------------|------------|-------------------|
-| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 1.1B | 1150k |
-| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 1.1B | 665k |
-| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 1.5B | 1150k |
+All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet.
+See table below for details on the three checkpoints:
+
+| Checkpoint | Task | UNet Model Size | Total Model Size | Training Data / h |
+|-----------------------------------------------------------------|---------------|-----------------|------------------|-------------------|
+| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
+| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
+| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
 
 ### Constructing a prompt
 
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
 
-The following example demonstrates how to construct good music generation using the aforementioned tips:
-
-```python
-import scipy
-import torch
-from diffusers import AudioLDM2Pipeline
-
-# load the best weights for music generation
-repo_id = "cvssp/audioldm2-music"
-pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-# define the prompts
-prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-negative_prompt = "Low quality."
-
-# set the seed
-generator = torch.Generator("cuda").manual_seed(0)
-
-# run the generation
-audio = pipe(
-    prompt,
-    negative_prompt=negative_prompt,
-    num_inference_steps=200,
-    audio_length_in_s=10.0,
-    num_waveforms_per_prompt=3,
-).audios
-
-# save the best audio sample (index 0) as a .wav file
-scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
-```
+The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 5f1cd73dd448..bcf6cce10509 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,19 +51,33 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> from diffusers import AudioLDM2Pipeline
-        >>> import torch
-        >>> import scipy
-
-        >>> repo_id = "cvssp/audioldm2"
-        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-        >>> pipe = pipe.to("cuda")
-
-        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-        >>> audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]
-
-        >>> # save the audio sample as a .wav file
-        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
+    >>> import scipy
+    >>> import torch
+    >>> from diffusers import AudioLDM2Pipeline
+
+    >>> # load the best weights for music generation
+    >>> repo_id = "cvssp/audioldm2-music"
+    >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+    >>> pipe = pipe.to("cuda")
+
+    >>> # define the prompts
+    >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+    >>> negative_prompt = "Low quality."
+
+    >>> # set the seed
+    >>> generator = torch.Generator("cuda").manual_seed(0)
+
+    >>> # run the generation
+    >>> audio = pipe(
+    >>>     prompt,
+    >>>     negative_prompt=negative_prompt,
+    >>>     num_inference_steps=200,
+    >>>     audio_length_in_s=10.0,
+    >>>     num_waveforms_per_prompt=3,
+    >>> ).audios
+
+    >>> # save the best audio sample (index 0) as a .wav file
+    >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
         ```
 """
 
@@ -315,6 +329,7 @@ def encode_prompt(
         Example:
 
         ```python
+        >>> import scipy
         >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
 
@@ -337,6 +352,9 @@ def encode_prompt(
         ...     num_inference_steps=200,
         ...     audio_length_in_s=10.0,
         ... ).audios[0]
+
+        >>> # save generated audio sample
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
         ```"""
     if prompt is not None and isinstance(prompt, str):
         batch_size = 1

From c52100bc47a006bd54f3d249a47e1036d1e74deb Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:13:17 +0100
Subject: [PATCH 02/17] fix unet docs

---
 src/diffusers/pipelines/audioldm2/modeling_audioldm2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index 37b9d9c2d1fb..27295054a680 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -659,7 +659,7 @@ def forward(
         encoder_attention_mask_1: Optional[torch.Tensor] = None,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
-        The [`UNet2DConditionModel`] forward method.
+        The [`AudioLDM2UNet2DConditionModel`] forward method.
 
         Args:
             sample (`torch.FloatTensor`):

From cf787760e37e96a9b63b3b8be6fee63e490f43e5 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:14:13 +0100
Subject: [PATCH 03/17] use image output for latents

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index bcf6cce10509..7450b103912b 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -39,7 +39,7 @@
     randn_tensor,
     replace_example_docstring,
 )
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
 
@@ -948,7 +948,7 @@ def __call__(
             latents = 1 / self.vae.config.scaling_factor * latents
             mel_spectrogram = self.vae.decode(latents).sample
         else:
-            return AudioPipelineOutput(audios=latents)
+            return ImagePipelineOutput(images=latents)
 
         audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
 
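Note on the change above: with patch 03, a caller who requests `output_type="latent"` gets the raw VAE latents back instead of a decoded waveform. A minimal sketch of decoding such latents manually, assuming the checkpoint name, prompt, and step count as illustrative stand-ins; the decode path simply mirrors the branch shown in the hunk above, and this output class is reverted to `AudioPipelineOutput` later in the series (patch 08):

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# output_type="latent" skips VAE decoding; at this point in the series the
# latents come back in an ImagePipelineOutput with an `.images` field
latents = pipe("Techno music with a strong, upbeat tempo", num_inference_steps=50, output_type="latent").images

with torch.no_grad():
    # mirror the decoding branch from the hunk above
    mel_spectrogram = pipe.vae.decode(1 / pipe.vae.config.scaling_factor * latents).sample
    audio = pipe.mel_spectrogram_to_waveform(mel_spectrogram)
```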
From 7ff4b46ea4b0d43643e133f1a50e4e1c9d03a4f8 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:14:42 +0100
Subject: [PATCH 04/17] fix hub checkpoints

---
 tests/pipelines/audioldm2/test_audioldm2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 7abcfbfeb3b9..2b85c5e8a763 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -514,7 +514,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0
         return inputs
 
     def test_audioldm2(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
 
@@ -532,7 +532,7 @@ def test_audioldm2(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_lms(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
 
@@ -552,7 +552,7 @@ def test_audioldm2_lms(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_large(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2-large")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)

From 6f90747ec2886789c1dacf71ca5633037260494d Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:18:49 +0100
Subject: [PATCH 05/17] fix pipeline example

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 7450b103912b..25b167ad6f2f 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,33 +51,33 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-    >>> import scipy
-    >>> import torch
-    >>> from diffusers import AudioLDM2Pipeline
-
-    >>> # load the best weights for music generation
-    >>> repo_id = "cvssp/audioldm2-music"
-    >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-    >>> pipe = pipe.to("cuda")
-
-    >>> # define the prompts
-    >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-    >>> negative_prompt = "Low quality."
-
-    >>> # set the seed
-    >>> generator = torch.Generator("cuda").manual_seed(0)
-
-    >>> # run the generation
-    >>> audio = pipe(
-    >>>     prompt,
-    >>>     negative_prompt=negative_prompt,
-    >>>     num_inference_steps=200,
-    >>>     audio_length_in_s=10.0,
-    >>>     num_waveforms_per_prompt=3,
-    >>> ).audios
-
-    >>> # save the best audio sample (index 0) as a .wav file
-    >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> # load the best weights for music generation
+        >>> repo_id = "cvssp/audioldm2-music"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> negative_prompt = "Low quality."
+
+        >>> # set the seed
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        >>>     prompt,
+        >>>     negative_prompt=negative_prompt,
+        >>>     num_inference_steps=200,
+        >>>     audio_length_in_s=10.0,
+        >>>     num_waveforms_per_prompt=3,
+        >>> ).audios
+
+        >>> # save the best audio sample (index 0) as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
         ```
 """

From a0f6ac5088f9559f700467ae3c9236649984da02 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:22:06 +0100
Subject: [PATCH 06/17] update example

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 25b167ad6f2f..0eece6e59bc6 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -55,13 +55,12 @@
         >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
 
-        >>> # load the best weights for music generation
-        >>> repo_id = "cvssp/audioldm2-music"
+        >>> repo_id = "cvssp/audioldm2"
         >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
 
         >>> # define the prompts
-        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> prompt = "The sound of a hammer hitting a wooden surface."
         >>> negative_prompt = "Low quality."

From addb98ef8b7e397d9d6b65b595d292d544dd6ea4 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:23:22 +0100
Subject: [PATCH 07/17] return_dict = False

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 0eece6e59bc6..4393d3cc6d30 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -926,7 +926,8 @@ def __call__(
                     encoder_hidden_states=generated_prompt_embeds,
                     encoder_hidden_states_1=prompt_embeds,
                     encoder_attention_mask_1=attention_mask,
-                ).sample
+                    return_dict=False,
+                )[0]
 
                 # perform guidance
                 if do_classifier_free_guidance:
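Patch 07 swaps the `.sample` attribute access for `return_dict=False` on the internal UNet call, so the denoising loop receives a plain tuple and skips constructing the output dataclass on every step. The same convention exists at the pipeline level; a short sketch, assuming the standard `diffusers` pipeline-level `return_dict` flag (prompt and settings are illustrative):

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# return_dict=True (the default): an AudioPipelineOutput with an `.audios` field
audio = pipe("Whistling wind", num_inference_steps=50, audio_length_in_s=5.0).audios[0]

# return_dict=False: a plain tuple instead, the same convention the patch applies to the UNet call
(audios,) = pipe("Whistling wind", num_inference_steps=50, audio_length_in_s=5.0, return_dict=False)
audio = audios[0]
```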
From e1504501d3bc308f624182afcb255cb709517629 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:28:20 +0100
Subject: [PATCH 08/17] revert image pipeline output

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 4393d3cc6d30..3d11014ae1f9 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -39,7 +39,7 @@
     randn_tensor,
     replace_example_docstring,
 )
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
 
@@ -948,7 +948,7 @@ def __call__(
             latents = 1 / self.vae.config.scaling_factor * latents
             mel_spectrogram = self.vae.decode(latents).sample
         else:
-            return ImagePipelineOutput(images=latents)
+            return AudioPipelineOutput(audios=latents)
 
         audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
 

From b2964316052d39451a78870366217e68e47f34fe Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:33:45 +0100
Subject: [PATCH 09/17] revert doc changes

---
 docs/source/en/api/pipelines/audioldm2.md | 52 +++++++++++++++++------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md
index 9d055d4019b2..f32ed6acdd80 100644
--- a/docs/source/en/api/pipelines/audioldm2.md
+++ b/docs/source/en/api/pipelines/audioldm2.md
@@ -20,10 +20,10 @@ Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelin
 is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from text embeddings.
 
 Two text encoder models are used to compute the text embeddings from a prompt input: the text-branch of
 [CLAP](https://huggingface.co/docs/transformers/main/en/model_doc/clap) and the encoder of [Flan-T5](https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5). These text embeddings
-are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel).
+are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2ProjectionModel).
 A [GPT2](https://huggingface.co/docs/transformers/main/en/model_doc/gpt2) _language model (LM)_ is used to auto-regressively
 predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding
-vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel)
+vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2UNet2DConditionModel)
 of AudioLDM 2 is unique in the sense that it takes **two** cross-attention embeddings, as opposed to one cross-attention
 conditioning, as in most other LDMs.
 
@@ -38,17 +38,13 @@ found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
 
 ### Choosing a checkpoint
 
-AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio
-generation. The third checkpoint is trained exclusively on text-to-music generation.
+AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio generation. The third checkpoint is trained exclusively on text-to-music generation. See table below for details on the three official checkpoints:
 
-All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet.
-See table below for details on the three checkpoints:
-
-| Checkpoint | Task | UNet Model Size | Total Model Size | Training Data / h |
-|-----------------------------------------------------------------|---------------|-----------------|------------------|-------------------|
-| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
-| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
-| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
+| Checkpoint | Task | Model Size | Training Data / h |
+|-----------------------------------------------------------------|---------------|------------|-------------------|
+| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 1.1B | 1150k |
+| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 1.1B | 665k |
+| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 1.5B | 1150k |
 
 ### Constructing a prompt
 
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
 
-The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
+The following example demonstrates how to construct good music generation using the aforementioned tips:
+
+```python
+import scipy
+import torch
+from diffusers import AudioLDM2Pipeline
+
+# load the best weights for music generation
+repo_id = "cvssp/audioldm2-music"
+pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+# define the prompts
+prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+negative_prompt = "Low quality."
+
+# set the seed
+generator = torch.Generator("cuda").manual_seed(0)
+
+# run the generation
+audio = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=200,
+    audio_length_in_s=10.0,
+    num_waveforms_per_prompt=3,
+).audios
+
+# save the best audio sample (index 0) as a .wav file
+scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+```

From 1c84471f6f1cb545b59b299d8376c7b0e656b8c8 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:45:10 +0100
Subject: [PATCH 10/17] remove dtype test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 23 ---------------------
 1 file changed, 23 deletions(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 2b85c5e8a763..39d8a12434d1 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -469,29 +469,6 @@ def test_save_load_optional_components(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
         super().test_save_load_optional_components(expected_max_difference=2e-4)
 
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
-        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(model_dtypes["text_encoder"] == torch.float64)
-
-        # Without the logit scale parameters, everything is float32
-        model_dtypes.pop("text_encoder")
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-
-        # the CLAP sub-models are float32
-        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-
-        # Once we send to fp16, all params are in half-precision, including the logit scale
-        pipe.to(torch_dtype=torch.float16)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
-
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):
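The test removed above (and re-added in amended form in patch 16) exercises pipeline-wide dtype casting. The behaviour it checks can be sketched as follows, using the same `DiffusionPipeline.to(torch_dtype=...)` call as the test itself; the checkpoint name is illustrative:

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")

# DiffusionPipeline.to can cast every registered sub-model in a single call
pipe.to(torch_dtype=torch.float16)

# afterwards all components report half precision, including the CLAP text branch
print(pipe.unet.dtype, pipe.vae.dtype, pipe.text_encoder.text_model.dtype)
```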
From 5d420c9c8a5cf1ea7cf5ef590ce017dd26ff9798 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 13:24:16 +0100
Subject: [PATCH 11/17] make style

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 3d11014ae1f9..4f2ec5ff2dc3 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -68,12 +68,12 @@
 
         >>> # run the generation
         >>> audio = pipe(
-        >>>     prompt,
-        >>>     negative_prompt=negative_prompt,
-        >>>     num_inference_steps=200,
-        >>>     audio_length_in_s=10.0,
-        >>>     num_waveforms_per_prompt=3,
-        >>> ).audios
+        ...     prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ...     num_waveforms_per_prompt=3,
+        ... ).audios
 
         >>> # save the best audio sample (index 0) as a .wav file
         >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])

From d0e22e4520d5d510e1ab4178251c108f692bc150 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 14:16:41 +0100
Subject: [PATCH 12/17] remove docstring updates

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 29 ++++---------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 4f2ec5ff2dc3..ed11d9780d28 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,32 +51,19 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> import scipy
-        >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
+        >>> import torch
+        >>> import scipy
 
         >>> repo_id = "cvssp/audioldm2"
         >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
 
-        >>> # define the prompts
-        >>> prompt = "The sound of a hammer hitting a wooden surface."
-        >>> negative_prompt = "Low quality."
-
-        >>> # set the seed
-        >>> generator = torch.Generator("cuda").manual_seed(0)
-
-        >>> # run the generation
-        >>> audio = pipe(
-        ...     prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_inference_steps=200,
-        ...     audio_length_in_s=10.0,
-        ...     num_waveforms_per_prompt=3,
-        ... ).audios
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]
 
-        >>> # save the best audio sample (index 0) as a .wav file
-        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+        >>> # save the audio sample as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
         ```
 """

From 4efa070797b81fec2ae8900bf2f3ce6d1777cc73 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 14:18:04 +0100
Subject: [PATCH 13/17] remove unet docstring update

---
 src/diffusers/pipelines/audioldm2/modeling_audioldm2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index 27295054a680..37b9d9c2d1fb 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -659,7 +659,7 @@ def forward(
         encoder_attention_mask_1: Optional[torch.Tensor] = None,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
-        The [`AudioLDM2UNet2DConditionModel`] forward method.
+        The [`UNet2DConditionModel`] forward method.
 
         Args:
             sample (`torch.FloatTensor`):

From 0fb7aad4bdf4d8cdc758b8af51f2658b4f1a77e6 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 24 Aug 2023 15:12:37 +0100
Subject: [PATCH 14/17] Empty commit to re-trigger CI

From 76a52b521a7b0dd668b653d14472091194db9d4a Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 09:24:49 +0100
Subject: [PATCH 15/17] fix cpu offload

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index ed11d9780d28..7c7dac545c5c 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -194,12 +194,15 @@ def enable_model_cpu_offload(self, gpu_id=0):
         torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
         model_sequence = [
-            self.text_encoder,
+            self.text_encoder.text_model,
+            self.text_encoder.text_projection,
             self.text_encoder_2,
             self.projection_model,
             self.language_model,
             self.unet,
             self.vae,
+            self.vocoder,
+            self.text_encoder,
         ]
 
         hook = None

From f4b74995d1535f1f0f8762f46b3dacef2cfe0f6d Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 10:07:07 +0100
Subject: [PATCH 16/17] fix dtype test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 24 ++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 39d8a12434d1..fdfcd3065e50 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -44,7 +44,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import is_xformers_available, slow, torch_device
+from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism
 
 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
@@ -469,6 +469,28 @@ def test_save_load_optional_components(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
         super().test_save_load_optional_components(expected_max_difference=2e-4)
 
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
+        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+
+        # Without the logit scale parameters, everything is float32
+        model_dtypes.pop("text_encoder")
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # the CLAP sub-models are float32
+        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # Once we send to fp16, all params are in half-precision, including the logit scale
+        pipe.to(torch_dtype=torch.float16)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
+
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):

From 6def17af21d40ad612f330e24675eb00f0f69d5c Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 10:07:23 +0100
Subject: [PATCH 17/17] add offload test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index fdfcd3065e50..942aec70d7cb 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -491,6 +491,26 @@ def test_to_dtype(self):
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
         self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload(self, expected_max_diff=2e-4):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = audioldm_pipe(**inputs)[0]
+
+        audioldm_pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = audioldm_pipe(**inputs)[0]
+
+        max_diff = np.abs(output_with_offload - output_without_offload).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):
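Patches 15 and 17 rework and test `enable_model_cpu_offload` for AudioLDM2's composite text-encoder stack. Typical end-user usage, sketched under the same assumptions as the test above (a CUDA device plus `accelerate` v0.17.0 or higher; the prompt and settings are illustrative, taken from the doc example earlier in the series):

```python
import scipy
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)

# instead of pipe.to("cuda"): keep sub-models on the CPU and move each one to the
# GPU only for the duration of its forward pass, trading speed for peak memory
pipe.enable_model_cpu_offload()

prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]

scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
```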