diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index 5f1cd73dd448..7c7dac545c5c 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -194,12 +194,15 @@ def enable_model_cpu_offload(self, gpu_id=0):
             torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
 
         model_sequence = [
-            self.text_encoder,
+            self.text_encoder.text_model,
+            self.text_encoder.text_projection,
             self.text_encoder_2,
             self.projection_model,
             self.language_model,
             self.unet,
             self.vae,
+            self.vocoder,
+            self.text_encoder,
         ]
 
         hook = None
@@ -909,7 +912,8 @@ def __call__(
                     encoder_hidden_states=generated_prompt_embeds,
                     encoder_hidden_states_1=prompt_embeds,
                     encoder_attention_mask_1=attention_mask,
-                ).sample
+                    return_dict=False,
+                )[0]
 
                 # perform guidance
                 if do_classifier_free_guidance:
diff --git a/tests/pipelines/audioldm2/test_audioldm2.py b/tests/pipelines/audioldm2/test_audioldm2.py
index 7abcfbfeb3b9..942aec70d7cb 100644
--- a/tests/pipelines/audioldm2/test_audioldm2.py
+++ b/tests/pipelines/audioldm2/test_audioldm2.py
@@ -44,7 +44,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import is_xformers_available, slow, torch_device
+from diffusers.utils import is_accelerate_available, is_accelerate_version, is_xformers_available, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism
 
 from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
@@ -477,7 +477,6 @@ def test_to_dtype(self):
         # The method component.dtype returns the dtype of the first parameter registered in the model, not the
         # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
-        self.assertTrue(model_dtypes["text_encoder"] == torch.float64)
 
         # Without the logit scale parameters, everything is float32
         model_dtypes.pop("text_encoder")
@@ -492,6 +491,26 @@ def test_to_dtype(self):
         model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
         self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"),
+        reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher",
+    )
+    def test_model_cpu_offload(self, expected_max_diff=2e-4):
+        components = self.get_dummy_components()
+        audioldm_pipe = AudioLDM2Pipeline(**components)
+        audioldm_pipe = audioldm_pipe.to(torch_device)
+        audioldm_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(torch_device)
+        output_without_offload = audioldm_pipe(**inputs)[0]
+
+        audioldm_pipe.enable_model_cpu_offload()
+        inputs = self.get_dummy_inputs(torch_device)
+        output_with_offload = audioldm_pipe(**inputs)[0]
+
+        max_diff = np.abs(output_with_offload - output_without_offload).max()
+        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
+
 
 @slow
 class AudioLDM2PipelineSlowTests(unittest.TestCase):
@@ -514,7 +533,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0
         return inputs
 
     def test_audioldm2(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
 
@@ -532,7 +551,7 @@ def test_audioldm2(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_lms(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
         audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)
@@ -552,7 +571,7 @@ def test_audioldm2_lms(self):
         assert max_diff < 1e-3
 
     def test_audioldm2_large(self):
-        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("/home/sanchit/convert-audioldm2/hub-audioldm2-large")
+        audioldm_pipe = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2-large")
         audioldm_pipe = audioldm_pipe.to(torch_device)
         audioldm_pipe.set_progress_bar_config(disable=None)