From 7c0c9ac7e25f51ea71a1bbc8c77ccbf6b4c7f2ba Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:12:26 +0100
Subject: [PATCH 01/17] fix docs

---
 docs/source/en/api/pipelines/audioldm2.md     | 52 +++++--------
 .../pipelines/audioldm2/pipeline_audioldm2.py | 44 +++++++++++-----
 2 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md
index f32ed6acdd80..9d055d4019b2 100644
--- a/docs/source/en/api/pipelines/audioldm2.md
+++ b/docs/source/en/api/pipelines/audioldm2.md
@@ -20,10 +20,10 @@ Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelin
 is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from text embeddings.
 
 Two text encoder models are used to compute the text embeddings from a prompt input: the text-branch of
 [CLAP](https://huggingface.co/docs/transformers/main/en/model_doc/clap) and the encoder of [Flan-T5](https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5). These text embeddings
-are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2ProjectionModel).
+are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel).
 A [GPT2](https://huggingface.co/docs/transformers/main/en/model_doc/gpt2) _language model (LM)_ is used to auto-regressively
 predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding
-vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2UNet2DConditionModel)
+vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel)
 of AudioLDM 2 is unique in the sense that it takes **two** cross-attention embeddings, as opposed to one cross-attention
 conditioning, as in most other LDMs.
 
@@ -38,13 +38,17 @@ found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
 
 ### Choosing a checkpoint
 
-AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio generation. The third checkpoint is trained exclusively on text-to-music generation. See table below for details on the three official checkpoints:
+AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio
+generation. The third checkpoint is trained exclusively on text-to-music generation.
 
-| Checkpoint | Task | Model Size | Training Data / h |
-|-----------------------------------------------------------------|---------------|------------|-------------------|
-| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 1.1B | 1150k |
-| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 1.1B | 665k |
-| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 1.5B | 1150k |
+All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet.
+See table below for details on the three checkpoints:
+
+| Checkpoint | Task | UNet Model Size | Total Model Size | Training Data / h |
+|-----------------------------------------------------------------|---------------|-----------------|------------------|-------------------|
+| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
+| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
+| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
 
 ### Constructing a prompt
 
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
 
-The following example demonstrates how to construct good music generation using the aforementioned tips:
-
-```python
-import scipy
-import torch
-from diffusers import AudioLDM2Pipeline
-
-# load the best weights for music generation
-repo_id = "cvssp/audioldm2-music"
-pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-# define the prompts
-prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-negative_prompt = "Low quality."
-
-# set the seed
-generator = torch.Generator("cuda").manual_seed(0)
-
-# run the generation
-audio = pipe(
-    prompt,
-    negative_prompt=negative_prompt,
-    num_inference_steps=200,
-    audio_length_in_s=10.0,
-    num_waveforms_per_prompt=3,
-).audios
-
-# save the best audio sample (index 0) as a .wav file
-scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
-```
+The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 5f1cd73dd448..bcf6cce10509 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,19 +51,33 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> from diffusers import AudioLDM2Pipeline
-        >>> import torch
-        >>> import scipy
-
-        >>> repo_id = "cvssp/audioldm2"
-        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-        >>> pipe = pipe.to("cuda")
-
-        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-        >>> audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]
-
-        >>> # save the audio sample as a .wav file
-        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
+    >>> import scipy
+    >>> import torch
+    >>> from diffusers import AudioLDM2Pipeline
+
+    >>> # load the best weights for music generation
+    >>> repo_id = "cvssp/audioldm2-music"
+    >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+    >>> pipe = pipe.to("cuda")
+
+    >>> # define the prompts
+    >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+    >>> negative_prompt = "Low quality."
+
+    >>> # set the seed
+    >>> generator = torch.Generator("cuda").manual_seed(0)
+
+    >>> # run the generation
+    >>> audio = pipe(
+    >>>     prompt,
+    >>>     negative_prompt=negative_prompt,
+    >>>     num_inference_steps=200,
+    >>>     audio_length_in_s=10.0,
+    >>>     num_waveforms_per_prompt=3,
+    >>> ).audios
+
+    >>> # save the best audio sample (index 0) as a .wav file
+    >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
         ```
 """
 
@@ -315,6 +329,7 @@ def encode_prompt(
         Example:
 
         ```python
+        >>> import scipy
         >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
 
@@ -337,6 +352,9 @@ def encode_prompt(
         ...     num_inference_steps=200,
         ...     audio_length_in_s=10.0,
         ... ).audios[0]
+
+        >>> # save generated audio sample
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
         ```"""
     if prompt is not None and isinstance(prompt, str):
         batch_size = 1

From c52100bc47a006bd54f3d249a47e1036d1e74deb Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:13:17 +0100
Subject: [PATCH 02/17] fix unet docs

---
 src/diffusers/pipelines/audioldm2/modeling_audioldm2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index 37b9d9c2d1fb..27295054a680 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -659,7 +659,7 @@ def forward(
         encoder_attention_mask_1: Optional[torch.Tensor] = None,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
-        The [`UNet2DConditionModel`] forward method.
+        The [`AudioLDM2UNet2DConditionModel`] forward method.
 
         Args:
             sample (`torch.FloatTensor`):

From cf787760e37e96a9b63b3b8be6fee63e490f43e5 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:14:13 +0100
Subject: [PATCH 03/17] use image output for latents

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index bcf6cce10509..7450b103912b 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -39,7 +39,7 @@
     randn_tensor,
     replace_example_docstring,
 )
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
 
@@ -948,7 +948,7 @@ def __call__(
             latents = 1 / self.vae.config.scaling_factor * latents
             mel_spectrogram = self.vae.decode(latents).sample
         else:
-            return AudioPipelineOutput(audios=latents)
+            return ImagePipelineOutput(images=latents)
 
         audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
 
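Note on the change above: with patch 03, a caller who requests `output_type="latent"` gets the raw VAE latents back instead of a decoded waveform. A minimal sketch of decoding such latents manually, assuming the checkpoint name, prompt, and step count as illustrative stand-ins; the decode path simply mirrors the branch shown in the hunk above, and this output class is reverted to `AudioPipelineOutput` later in the series (patch 08):

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# output_type="latent" skips VAE decoding; at this point in the series the
# latents come back in an ImagePipelineOutput with an `.images` field
latents = pipe("Techno music with a strong, upbeat tempo", num_inference_steps=50, output_type="latent").images

with torch.no_grad():
    # mirror the decoding branch from the hunk above
    mel_spectrogram = pipe.vae.decode(1 / pipe.vae.config.scaling_factor * latents).sample
    audio = pipe.mel_spectrogram_to_waveform(mel_spectrogram)
```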
From 7ff4b46ea4b0d43643e133f1a50e4e1c9d03a4f8 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:14:42 +0100
Subject: [PATCH 04/17] fix hub checkpoints

---
 tests/pipelines/audioldm2/test_audioldm2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 7abcfbfeb3b9..2b85c5e8a763 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -514,7 +514,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0
         return inputs
 
     def test_audioldm2(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
 
@@ -532,7 +532,7 @@ def test_audioldm2(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_lms(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
 
@@ -552,7 +552,7 @@ def test_audioldm2_lms(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_large(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2-large")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)

From 6f90747ec2886789c1dacf71ca5633037260494d Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:18:49 +0100
Subject: [PATCH 05/17] fix pipeline example

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 7450b103912b..25b167ad6f2f 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,33 +51,33 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-    >>> import scipy
-    >>> import torch
-    >>> from diffusers import AudioLDM2Pipeline
-
-    >>> # load the best weights for music generation
-    >>> repo_id = "cvssp/audioldm2-music"
-    >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-    >>> pipe = pipe.to("cuda")
-
-    >>> # define the prompts
-    >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
-    >>> negative_prompt = "Low quality."
-
-    >>> # set the seed
-    >>> generator = torch.Generator("cuda").manual_seed(0)
-
-    >>> # run the generation
-    >>> audio = pipe(
-    >>>     prompt,
-    >>>     negative_prompt=negative_prompt,
-    >>>     num_inference_steps=200,
-    >>>     audio_length_in_s=10.0,
-    >>>     num_waveforms_per_prompt=3,
-    >>> ).audios
-
-    >>> # save the best audio sample (index 0) as a .wav file
-    >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+        >>> import scipy
+        >>> import torch
+        >>> from diffusers import AudioLDM2Pipeline
+
+        >>> # load the best weights for music generation
+        >>> repo_id = "cvssp/audioldm2-music"
+        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+        >>> pipe = pipe.to("cuda")
+
+        >>> # define the prompts
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> negative_prompt = "Low quality."
+
+        >>> # set the seed
+        >>> generator = torch.Generator("cuda").manual_seed(0)
+
+        >>> # run the generation
+        >>> audio = pipe(
+        >>>     prompt,
+        >>>     negative_prompt=negative_prompt,
+        >>>     num_inference_steps=200,
+        >>>     audio_length_in_s=10.0,
+        >>>     num_waveforms_per_prompt=3,
+        >>> ).audios
+
+        >>> # save the best audio sample (index 0) as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
         ```
 """

From a0f6ac5088f9559f700467ae3c9236649984da02 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:22:06 +0100
Subject: [PATCH 06/17] update example

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 25b167ad6f2f..0eece6e59bc6 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -55,13 +55,12 @@
         >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
 
-        >>> # load the best weights for music generation
-        >>> repo_id = "cvssp/audioldm2-music"
+        >>> repo_id = "cvssp/audioldm2"
         >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
 
         >>> # define the prompts
-        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> prompt = "The sound of a hammer hitting a wooden surface."
         >>> negative_prompt = "Low quality."

From addb98ef8b7e397d9d6b65b595d292d544dd6ea4 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:23:22 +0100
Subject: [PATCH 07/17] return_dict = False

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 0eece6e59bc6..4393d3cc6d30 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -926,7 +926,8 @@ def __call__(
                     encoder_hidden_states=generated_prompt_embeds,
                     encoder_hidden_states_1=prompt_embeds,
                     encoder_attention_mask_1=attention_mask,
-                ).sample
+                    return_dict=False,
+                )[0]
 
                 # perform guidance
                 if do_classifier_free_guidance:
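Patch 07 swaps the `.sample` attribute access for `return_dict=False` on the internal UNet call, so the denoising loop receives a plain tuple and skips constructing the output dataclass on every step. The same convention exists at the pipeline level; a short sketch, assuming the standard `diffusers` pipeline-level `return_dict` flag (prompt and settings are illustrative):

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# return_dict=True (the default): an AudioPipelineOutput with an `.audios` field
audio = pipe("Whistling wind", num_inference_steps=50, audio_length_in_s=5.0).audios[0]

# return_dict=False: a plain tuple instead, the same convention the patch applies to the UNet call
(audios,) = pipe("Whistling wind", num_inference_steps=50, audio_length_in_s=5.0, return_dict=False)
audio = audios[0]
```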
From e1504501d3bc308f624182afcb255cb709517629 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:28:20 +0100
Subject: [PATCH 08/17] revert image pipeline output

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 4393d3cc6d30..3d11014ae1f9 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -39,7 +39,7 @@
     randn_tensor,
     replace_example_docstring,
 )
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
 
 
@@ -948,7 +948,7 @@ def __call__(
             latents = 1 / self.vae.config.scaling_factor * latents
             mel_spectrogram = self.vae.decode(latents).sample
         else:
-            return ImagePipelineOutput(images=latents)
+            return AudioPipelineOutput(audios=latents)
 
         audio = self.mel_spectrogram_to_waveform(mel_spectrogram)
 

From b2964316052d39451a78870366217e68e47f34fe Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:33:45 +0100
Subject: [PATCH 09/17] revert doc changes

---
 docs/source/en/api/pipelines/audioldm2.md | 52 +++++++++++++++++------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md
index 9d055d4019b2..f32ed6acdd80 100644
--- a/docs/source/en/api/pipelines/audioldm2.md
+++ b/docs/source/en/api/pipelines/audioldm2.md
@@ -20,10 +20,10 @@ Inspired by [Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelin
 is a text-to-audio _latent diffusion model (LDM)_ that learns continuous audio representations from text embeddings.
 
 Two text encoder models are used to compute the text embeddings from a prompt input: the text-branch of
 [CLAP](https://huggingface.co/docs/transformers/main/en/model_doc/clap) and the encoder of [Flan-T5](https://huggingface.co/docs/transformers/main/en/model_doc/flan-t5). These text embeddings
-are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/main/api/pipelines/audioldm2#diffusers.AudioLDM2ProjectionModel).
+are then projected to a shared embedding space by an [AudioLDM2ProjectionModel](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2ProjectionModel).
 A [GPT2](https://huggingface.co/docs/transformers/main/en/model_doc/gpt2) _language model (LM)_ is used to auto-regressively
 predict eight new embedding vectors, conditional on the projected CLAP and Flan-T5 embeddings. The generated embedding
-vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2UNet2DConditionModel)
+vectors and Flan-T5 text embeddings are used as cross-attention conditioning in the LDM. The [UNet](https://huggingface.co/docs/diffusers/api/pipelines/audioldm2/AudioLDM2UNet2DConditionModel)
 of AudioLDM 2 is unique in the sense that it takes **two** cross-attention embeddings, as opposed to one cross-attention
 conditioning, as in most other LDMs.
 
@@ -38,17 +38,13 @@ found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2).
 
 ### Choosing a checkpoint
 
-AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio
-generation. The third checkpoint is trained exclusively on text-to-music generation.
+AudioLDM2 comes in three variants. Two of these checkpoints are applicable to the general task of text-to-audio generation. The third checkpoint is trained exclusively on text-to-music generation. See table below for details on the three official checkpoints:
 
-All checkpoints share the same model size for the text encoders and VAE. They differ in the size and depth of the UNet.
-See table below for details on the three checkpoints:
-
-| Checkpoint | Task | UNet Model Size | Total Model Size | Training Data / h |
-|-----------------------------------------------------------------|---------------|-----------------|------------------|-------------------|
-| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 350M | 1.1B | 1150k |
-| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 750M | 1.5B | 1150k |
-| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 350M | 1.1B | 665k |
+| Checkpoint | Task | Model Size | Training Data / h |
+|-----------------------------------------------------------------|---------------|------------|-------------------|
+| [audioldm2](https://huggingface.co/cvssp/audioldm2) | Text-to-audio | 1.1B | 1150k |
+| [audioldm2-music](https://huggingface.co/cvssp/audioldm2-music) | Text-to-music | 1.1B | 665k |
+| [audioldm2-large](https://huggingface.co/cvssp/audioldm2-large) | Text-to-audio | 1.5B | 1150k |
 
 ### Constructing a prompt
 
 * The quality of the generated waveforms can vary significantly based on the seed. Try generating with different seeds until you find a satisfactory generation
 * Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
 
-The following example demonstrates how to construct good music generation using the aforementioned tips: [example](https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2#diffusers.AudioLDM2Pipeline.__call__.example).
+The following example demonstrates how to construct good music generation using the aforementioned tips:
+
+```python
+import scipy
+import torch
+from diffusers import AudioLDM2Pipeline
+
+# load the best weights for music generation
+repo_id = "cvssp/audioldm2-music"
+pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+# define the prompts
+prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+negative_prompt = "Low quality."
+
+# set the seed
+generator = torch.Generator("cuda").manual_seed(0)
+
+# run the generation
+audio = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    num_inference_steps=200,
+    audio_length_in_s=10.0,
+    num_waveforms_per_prompt=3,
+).audios
+
+# save the best audio sample (index 0) as a .wav file
+scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+```

From 1c84471f6f1cb545b59b299d8376c7b0e656b8c8 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 12:45:10 +0100
Subject: [PATCH 10/17] remove dtype test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 23 ---------------------
 1 file changed, 23 deletions(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 2b85c5e8a763..39d8a12434d1 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -469,29 +469,6 @@ def test_save_load_optional_components(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
         super().test_save_load_optional_components(expected_max_difference=2e-4)
 
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
-        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(model_dtypes["text_encoder"] == torch.float64)
-
-        # Without the logit scale parameters, everything is float32
-        model_dtypes.pop("text_encoder")
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-
-        # the CLAP sub-models are float32
-        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
-
-        # Once we send to fp16, all params are in half-precision, including the logit scale
-        pipe.to(torch_dtype=torch.float16)
-        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
-
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):
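The test removed above (and re-added in amended form in patch 16) exercises pipeline-wide dtype casting. The behaviour it checks can be sketched as follows, using the same `DiffusionPipeline.to(torch_dtype=...)` call as the test itself; the checkpoint name is illustrative:

```python
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")

# DiffusionPipeline.to can cast every registered sub-model in a single call
pipe.to(torch_dtype=torch.float16)

# afterwards all components report half precision, including the CLAP text branch
print(pipe.unet.dtype, pipe.vae.dtype, pipe.text_encoder.text_model.dtype)
```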
From 5d420c9c8a5cf1ea7cf5ef590ce017dd26ff9798 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 13:24:16 +0100
Subject: [PATCH 11/17] make style

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 3d11014ae1f9..4f2ec5ff2dc3 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -68,12 +68,12 @@
 
         >>> # run the generation
         >>> audio = pipe(
-        >>>     prompt,
-        >>>     negative_prompt=negative_prompt,
-        >>>     num_inference_steps=200,
-        >>>     audio_length_in_s=10.0,
-        >>>     num_waveforms_per_prompt=3,
-        >>> ).audios
+        ...     prompt,
+        ...     negative_prompt=negative_prompt,
+        ...     num_inference_steps=200,
+        ...     audio_length_in_s=10.0,
+        ...     num_waveforms_per_prompt=3,
+        ... ).audios
 
         >>> # save the best audio sample (index 0) as a .wav file
         >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])

From d0e22e4520d5d510e1ab4178251c108f692bc150 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 14:16:41 +0100
Subject: [PATCH 12/17] remove docstring updates

---
 .../pipelines/audioldm2/pipeline_audioldm2.py | 29 ++++---------------
 1 file changed, 6 insertions(+), 23 deletions(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 4f2ec5ff2dc3..ed11d9780d28 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -51,32 +51,19 @@
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
-        >>> import scipy
-        >>> import torch
         >>> from diffusers import AudioLDM2Pipeline
+        >>> import torch
+        >>> import scipy
 
         >>> repo_id = "cvssp/audioldm2"
         >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
         >>> pipe = pipe.to("cuda")
 
-        >>> # define the prompts
-        >>> prompt = "The sound of a hammer hitting a wooden surface."
-        >>> negative_prompt = "Low quality."
-
-        >>> # set the seed
-        >>> generator = torch.Generator("cuda").manual_seed(0)
-
-        >>> # run the generation
-        >>> audio = pipe(
-        ...     prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_inference_steps=200,
-        ...     audio_length_in_s=10.0,
-        ...     num_waveforms_per_prompt=3,
-        ... ).audios
+        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
+        >>> audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]
 
-        >>> # save the best audio sample (index 0) as a .wav file
-        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])
+        >>> # save the audio sample as a .wav file
+        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
         ```
 """

From 4efa070797b81fec2ae8900bf2f3ce6d1777cc73 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Wed, 23 Aug 2023 14:18:04 +0100
Subject: [PATCH 13/17] remove unet docstring update

---
 src/diffusers/pipelines/audioldm2/modeling_audioldm2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index 27295054a680..37b9d9c2d1fb 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -659,7 +659,7 @@ def forward(
         encoder_attention_mask_1: Optional[torch.Tensor] = None,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
-        The [`AudioLDM2UNet2DConditionModel`] forward method.
+        The [`UNet2DConditionModel`] forward method.
 
         Args:
             sample (`torch.FloatTensor`):

From 0fb7aad4bdf4d8cdc758b8af51f2658b4f1a77e6 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Thu, 24 Aug 2023 15:12:37 +0100
Subject: [PATCH 14/17] Empty commit to re-trigger CI

From 76a52b521a7b0dd668b653d14472091194db9d4a Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 09:24:49 +0100
Subject: [PATCH 15/17] fix cpu offload

---
 src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index ed11d9780d28..7c7dac545c5c 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -194,12 +194,15 @@ def enable_model_cpu_offload(self, gpu_id=0):
         torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
         model_sequence = [
-            self.text_encoder,
+            self.text_encoder.text_model,
+            self.text_encoder.text_projection,
             self.text_encoder_2,
             self.projection_model,
             self.language_model,
             self.unet,
             self.vae,
+            self.vocoder,
+            self.text_encoder,
         ]
 
         hook = None

From f4b74995d1535f1f0f8762f46b3dacef2cfe0f6d Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 10:07:07 +0100
Subject: [PATCH 16/17] fix dtype test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 24 ++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 39d8a12434d1..fdfcd3065e50 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -44,7 +44,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import is_xformers_available, slow, torch_device
+from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism
 
 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
@@ -469,6 +469,28 @@ def test_save_load_optional_components(self):
         # increase tolerance from 1e-4 -> 2e-4 to account for large composite model
         super().test_save_load_optional_components(expected_max_difference=2e-4)
 
+    def test_to_dtype(self):
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.set_progress_bar_config(disable=None)
+
+        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
+        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+
+        # Without the logit scale parameters, everything is float32
+        model_dtypes.pop("text_encoder")
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # the CLAP sub-models are float32
+        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
+        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
+
+        # Once we send to fp16, all params are in half-precision, including the logit scale
+        pipe.to(torch_dtype=torch.float16)
+        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
+        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
+
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):

From 6def17af21d40ad612f330e24675eb00f0f69d5c Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Fri, 25 Aug 2023 10:07:23 +0100
Subject: [PATCH 17/17] add offload test

---
 tests/pipelines/audioldm2/test_audioldm2.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index fdfcd3065e50..942aec70d7cb 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -491,6 +491,26 @@ def test_to_dtype(self):
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
         self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload(self, expected_max_diff=2e-4):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = audioldm_pipe(**inputs)[0]
+
+        audioldm_pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = audioldm_pipe(**inputs)[0]
+
+        max_diff = np.abs(output_with_offload - output_without_offload).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):
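Patches 15 and 17 rework and test `enable_model_cpu_offload` for AudioLDM2's composite text-encoder stack. Typical end-user usage, sketched under the same assumptions as the test above (a CUDA device plus `accelerate` v0.17.0 or higher; the prompt and settings are illustrative, taken from the doc example earlier in the series):

```python
import scipy
import torch
from diffusers import AudioLDM2Pipeline

pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2", torch_dtype=torch.float16)

# instead of pipe.to("cuda"): keep sub-models on the CPU and move each one to the
# GPU only for the duration of its forward pass, trading speed for peak memory
pipe.enable_model_cpu_offload()

prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
audio = pipe(prompt, num_inference_steps=200, audio_length_in_s=10.0).audios[0]

scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
```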