From b7471a0cf7d8b7336f73489579e1d57ab60f49dc Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Mon, 13 Mar 2023 18:40:05 -0400 Subject: [PATCH 01/10] updated black format --- .../textual_inversion_bf16.py | 5 +- .../text_to_image/train_text_to_image.py | 2 + .../textual_inversion/textual_inversion.py | 5 +- .../textual_inversion/textual_inversion.py | 5 +- .../textual_inversion_flax.py | 5 +- src/diffusers/models/autoencoder_kl.py | 8 ++ src/diffusers/models/vae.py | 74 +++++++++++++++---- 7 files changed, 74 insertions(+), 30 deletions(-) diff --git a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py index f446efc0b0c0..99fce231c590 100644 --- a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py +++ b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py @@ -336,10 +336,7 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - ( - h, - w, - ) = ( + (h, w,) = ( img.shape[0], img.shape[1], ) diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index f91aa2f8d29b..6f150daa1eb0 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -412,6 +412,8 @@ def main(): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() + vae.enable_gradient_checkpointing() + vae.requires_grad_(True) # Enable TF32 for faster training on Ampere GPUs, # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index 9f3c7a42707b..680a61cd9bb1 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -443,10 +443,7 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - ( - h, - w, - ) = ( + (h, w,) = ( img.shape[0], img.shape[1], ) diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 47f5e60d5fd2..bc0b5d47709c 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -441,10 +441,7 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - ( - h, - w, - ) = ( + (h, w,) = ( img.shape[0], img.shape[1], ) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index c23fa4f5d38a..5935e4a9f46d 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -306,10 +306,7 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - ( - h, - w, - ) = ( + (h, w,) = ( img.shape[0], img.shape[1], ) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 9cb0a4b2432b..c2a7f422344b 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -121,6 +121,10 @@ def __init__( self.tile_latent_min_size = int(sample_size / (2 ** 
(len(self.block_out_channels) - 1))) self.tile_overlap_factor = 0.25 + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, Decoder)): + module.gradient_checkpointing = value + def enable_tiling(self, use_tiling: bool = True): r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to @@ -204,6 +208,8 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen Args: When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is: + + different from non-tiled encoding due to each tile using a different encoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. @@ -250,6 +256,8 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ Args: When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding is: + + different from non-tiled decoding due to each tile using a different decoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index c5142a8f15b7..01ed8a434c94 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -25,9 +25,8 @@ @dataclass class DecoderOutput(BaseOutput): """ - Output of decoding method. - Args: + Output of decoding method. sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Decoded output sample of the model. Output of the last layer of the model. 
""" @@ -50,7 +49,13 @@ def __init__( super().__init__() self.layers_per_block = layers_per_block - self.conv_in = torch.nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + self.conv_in = torch.nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + ) self.mid_block = None self.down_blocks = nn.ModuleList([]) @@ -96,16 +101,34 @@ def __init__( conv_out_channels = 2 * out_channels if double_z else out_channels self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) + self.gradient_checkpointing = False + def forward(self, x): sample = x sample = self.conv_in(sample) - # down - for down_block in self.down_blocks: - sample = down_block(sample) + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # down + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) + + # middle + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) - # middle - sample = self.mid_block(sample) + else: + # down + for down_block in self.down_blocks: + sample = down_block(sample) + + # middle + sample = self.mid_block(sample) # post-process sample = self.conv_norm_out(sample) @@ -129,7 +152,13 @@ def __init__( super().__init__() self.layers_per_block = layers_per_block - self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) self.mid_block = None self.up_blocks = nn.ModuleList([]) @@ -176,16 +205,33 @@ def __init__( self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + self.gradient_checkpointing = False + def forward(self, z): sample = z sample = self.conv_in(sample) - # middle - sample = self.mid_block(sample) + if self.training and self.gradient_checkpointing: - # up - for up_block in self.up_blocks: - sample = up_block(sample) + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # middle + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) + else: + # middle + sample = self.mid_block(sample) + + # up + for up_block in self.up_blocks: + sample = up_block(sample) # post-process sample = self.conv_norm_out(sample) From fd876d6998509bf1a4b3d923bb2084dd52970523 Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Mon, 13 Mar 2023 18:47:24 -0400 Subject: [PATCH 02/10] update black format --- .../textual_inversion/textual_inversion_bf16.py | 5 ++++- .../textual_inversion/textual_inversion.py | 5 ++++- examples/textual_inversion/textual_inversion.py | 5 ++++- .../textual_inversion/textual_inversion_flax.py | 5 ++++- src/diffusers/models/autoencoder_kl.py | 15 ++++++++++++--- 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py index 99fce231c590..f446efc0b0c0 100644 --- a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py +++ 
b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py @@ -336,7 +336,10 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - (h, w,) = ( + ( + h, + w, + ) = ( img.shape[0], img.shape[1], ) diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index 680a61cd9bb1..9f3c7a42707b 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -443,7 +443,10 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - (h, w,) = ( + ( + h, + w, + ) = ( img.shape[0], img.shape[1], ) diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index bc0b5d47709c..47f5e60d5fd2 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -441,7 +441,10 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - (h, w,) = ( + ( + h, + w, + ) = ( img.shape[0], img.shape[1], ) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index 5935e4a9f46d..c23fa4f5d38a 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -306,7 +306,10 @@ def __getitem__(self, i): if self.center_crop: crop = min(img.shape[0], img.shape[1]) - (h, w,) = ( + ( + h, + w, + ) = ( img.shape[0], img.shape[1], ) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index c2a7f422344b..550b0550933d 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -225,7 +225,12 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen for i in range(0, x.shape[2], overlap_size): row = [] for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = x[ + :, + :, + i : i + self.tile_sample_min_size, + j : j + self.tile_sample_min_size, + ] tile = self.encoder(tile) tile = self.quant_conv(tile) row.append(tile) @@ -257,7 +262,6 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding is: - different from non-tiled decoding due to each tile using a different decoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. 
@@ -275,7 +279,12 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ for i in range(0, z.shape[2], overlap_size): row = [] for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] + tile = z[ + :, + :, + i : i + self.tile_latent_min_size, + j : j + self.tile_latent_min_size, + ] tile = self.post_quant_conv(tile) decoded = self.decoder(tile) row.append(decoded) From a7f9da1d476717b18c7f7cf8514a0031577ac260 Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Mon, 13 Mar 2023 18:50:18 -0400 Subject: [PATCH 03/10] make style format --- src/diffusers/models/autoencoder_kl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 550b0550933d..a3a9ba32ac71 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -210,6 +210,7 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is: + different from non-tiled encoding due to each tile using a different encoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. @@ -262,6 +263,7 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding is: + different from non-tiled decoding due to each tile using a different decoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. From d023b5e9984a68fd68d31dc1bd22ffdd35c484a9 Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Mon, 13 Mar 2023 18:59:53 -0400 Subject: [PATCH 04/10] updated line endings --- src/diffusers/models/autoencoder_kl.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index a3a9ba32ac71..5c059b3bf25c 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -208,9 +208,6 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen Args: When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is: - - - different from non-tiled encoding due to each tile using a different encoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. @@ -262,8 +259,6 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ Args: When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding is: - - different from non-tiled decoding due to each tile using a different decoder. 
To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the look of the output, but they should be much less noticeable. From de4f5cd00aa0cd7b042f03415f3088f976f071c8 Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Mon, 13 Mar 2023 19:01:58 -0400 Subject: [PATCH 05/10] update code formatting --- src/diffusers/models/autoencoder_kl.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 5c059b3bf25c..db3d9f8b49bc 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -223,12 +223,7 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen for i in range(0, x.shape[2], overlap_size): row = [] for j in range(0, x.shape[3], overlap_size): - tile = x[ - :, - :, - i : i + self.tile_sample_min_size, - j : j + self.tile_sample_min_size, - ] + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] tile = self.encoder(tile) tile = self.quant_conv(tile) row.append(tile) @@ -276,12 +271,7 @@ def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[ for i in range(0, z.shape[2], overlap_size): row = [] for j in range(0, z.shape[3], overlap_size): - tile = z[ - :, - :, - i : i + self.tile_latent_min_size, - j : j + self.tile_latent_min_size, - ] + tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] tile = self.post_quant_conv(tile) decoded = self.decoder(tile) row.append(decoded) From dcb23405b38a13e15d429e852cb552f6368b33f4 Mon Sep 17 00:00:00 2001 From: Andy <37781802+Pie31415@users.noreply.github.com> Date: Tue, 14 Mar 2023 14:13:48 -0400 Subject: [PATCH 06/10] Update examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py Co-authored-by: Patrick von Platen --- .../onnxruntime/text_to_image/train_text_to_image.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index 6f150daa1eb0..637b35b3f695 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -413,7 +413,6 @@ def main(): if args.gradient_checkpointing: unet.enable_gradient_checkpointing() vae.enable_gradient_checkpointing() - vae.requires_grad_(True) # Enable TF32 for faster training on Ampere GPUs, # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices From 7ed451357ec7257d5f80a85b4b868c8e94875e52 Mon Sep 17 00:00:00 2001 From: Andy <37781802+Pie31415@users.noreply.github.com> Date: Tue, 14 Mar 2023 14:20:18 -0400 Subject: [PATCH 07/10] Update src/diffusers/models/vae.py Co-authored-by: Patrick von Platen --- src/diffusers/models/vae.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 01ed8a434c94..fac1aa764bea 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -25,6 +25,8 @@ @dataclass class DecoderOutput(BaseOutput): """ + Output of decoding method. + Args: Output of decoding method. 
sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): From 7ec6ccffd436e5b8addca062357e6c01868e5124 Mon Sep 17 00:00:00 2001 From: Andy <37781802+Pie31415@users.noreply.github.com> Date: Tue, 14 Mar 2023 14:20:28 -0400 Subject: [PATCH 08/10] Update src/diffusers/models/vae.py Co-authored-by: Patrick von Platen --- src/diffusers/models/vae.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index fac1aa764bea..b4484823ac3d 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -28,7 +28,6 @@ class DecoderOutput(BaseOutput): Output of decoding method. Args: - Output of decoding method. sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Decoded output sample of the model. Output of the last layer of the model. """ From a4343f08e49733bb40b9ed0e2b6cfdf0d4a037c9 Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Thu, 16 Mar 2023 14:43:12 -0400 Subject: [PATCH 09/10] added vae gradient checkpointing test --- src/diffusers/models/autoencoder_kl.py | 2 ++ tests/models/test_models_vae.py | 41 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 1e515323e031..9c0161065e4c 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -65,6 +65,8 @@ class AutoencoderKL(ModelMixin, ConfigMixin): Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. """ + _supports_gradient_checkpointing = True + @register_to_config def __init__( self, diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index 5d0aa194c1df..3eb7ce861592 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -68,6 +68,47 @@ def test_forward_signature(self): def test_training(self): pass + @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") + def test_gradient_checkpointing(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + + assert not model.is_gradient_checkpointing and model.training + + out = model(**inputs_dict).sample + # run the backwards pass on the model. For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model.zero_grad() + + labels = torch.randn_like(out) + loss = (out - labels).mean() + loss.backward() + + # re-instantiate the model now enabling gradient checkpointing + model_2 = self.model_class(**init_dict) + # clone model + model_2.load_state_dict(model.state_dict()) + model_2.to(torch_device) + model_2.enable_gradient_checkpointing() + + assert model_2.is_gradient_checkpointing and model_2.training + + out_2 = model_2(**inputs_dict).sample + # run the backwards pass on the model. 
For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model_2.zero_grad() + loss_2 = (out_2 - labels).mean() + loss_2.backward() + + # compare the output and parameters gradients + self.assertTrue((loss - loss_2).abs() < 1e-5) + named_params = dict(model.named_parameters()) + named_params_2 = dict(model_2.named_parameters()) + for name, param in named_params.items(): + self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5)) + def test_from_pretrained_hub(self): model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) self.assertIsNotNone(model) From 5bbbf1c8493ca331853516810fab7c90970638bf Mon Sep 17 00:00:00 2001 From: Pie31415 Date: Thu, 16 Mar 2023 14:50:45 -0400 Subject: [PATCH 10/10] make style --- src/diffusers/models/cross_attention.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/models/cross_attention.py b/src/diffusers/models/cross_attention.py index 1bb4ad2f4a67..df88fff98316 100644 --- a/src/diffusers/models/cross_attention.py +++ b/src/diffusers/models/cross_attention.py @@ -24,9 +24,7 @@ SlicedAttnProcessor, XFormersAttnProcessor, ) -from .attention_processor import ( # noqa: F401 - AttnProcessor as AttnProcessorRename, -) +from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401 deprecate(
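
A minimal usage sketch of the VAE gradient checkpointing introduced by this series (not part of the patches themselves). It only exercises calls that appear in the diffs above — AutoencoderKL.from_pretrained, enable_gradient_checkpointing, is_gradient_checkpointing — while the input shape and the dummy reconstruction loss are illustrative assumptions:

    import torch
    from diffusers import AutoencoderKL

    # "fusing/autoencoder-kl-dummy" is the small model used by the test added in
    # patch 09; any AutoencoderKL checkpoint would work, this one is just an example.
    vae = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy")

    # Checkpointing only activates when the module is in training mode, because the
    # Encoder/Decoder forward passes check `self.training and self.gradient_checkpointing`.
    vae.train()
    vae.enable_gradient_checkpointing()
    assert vae.is_gradient_checkpointing

    # One illustrative reconstruction step: the down/mid/up blocks are re-run during
    # backward instead of storing activations, trading compute for memory.
    sample = torch.randn(1, 3, 32, 32)
    out = vae(sample).sample
    loss = (out - sample).pow(2).mean()
    loss.backward()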