From 7664874de92e7940911309f4c613e7dea67ed78d Mon Sep 17 00:00:00 2001 From: duongna21 Date: Thu, 3 Nov 2022 17:23:10 +0700 Subject: [PATCH 01/33] Add ldm super resolution pipeline --- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 2 +- .../pipelines/latent_diffusion/__init__.py | 1 + ...peline_latent_diffusion_superresolution.py | 143 ++++++++++++++++++ 4 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 49c3e82b8e7b..c8a090086459 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -61,6 +61,7 @@ if is_torch_available() and is_transformers_available(): from .pipelines import ( LDMTextToImagePipeline, + LDMSuperResolutionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b3124af39077..a0033041008b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -13,7 +13,7 @@ from ..utils.dummy_pt_objects import * # noqa F403 if is_torch_available() and is_transformers_available(): - from .latent_diffusion import LDMTextToImagePipeline + from .latent_diffusion import LDMTextToImagePipeline, LDMSuperResolutionPipeline from .stable_diffusion import ( StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, diff --git a/src/diffusers/pipelines/latent_diffusion/__init__.py b/src/diffusers/pipelines/latent_diffusion/__init__.py index c481b38cf5e0..3442222dfa77 100644 --- a/src/diffusers/pipelines/latent_diffusion/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion/__init__.py @@ -4,3 +4,4 @@ if is_transformers_available(): from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline + from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py new file mode 100644 index 000000000000..cd117fa4559b --- /dev/null +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -0,0 +1,143 @@ +import inspect +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint + +import PIL + +from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler + + +def preprocess(image): + w, h = image.size + w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +class LDMSuperResolutionPipeline(DiffusionPipeline): + r""" + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) Model to encode and decode images to and from latent representations. 
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + def __init__( + self, + vqvae: Union[VQModel, AutoencoderKL], + unet: Union[UNet2DModel, UNet2DConditionModel], + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): + super().__init__() + self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + init_image: Union[torch.FloatTensor, PIL.Image.Image], + batch_size: Optional[int] = 1, + num_inference_steps: Optional[int] = 100, + eta: Optional[float] = 0.0, + generator: Optional[torch.Generator] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + Args: + init_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + batch_size (`int`, *optional*, defaults to 1): + Number of images to generate. + height (`int`, *optional*, defaults to 256): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 256): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator`, *optional*): + A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation + deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*): + Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + + Returns: + [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the + generated images. + """ + + if isinstance(init_image, PIL.Image.Image): + batch_size = 1 + elif isinstance(init_image, torch.FloatTensor): + batch_size = init_image.shape[0] + else: + raise ValueError( + f"`init_image` has to be of type `PIL.Image.Image` or `torch.FloatTensor` but is {type(init_image)}" + ) + + if isinstance(init_image, PIL.Image.Image): + init_image = preprocess(init_image) + + height, width = init_image.shape[-2:] + + latents = torch.randn( + (batch_size, self.unet.in_channels // 2, height, width), + device="cuda", + generator=generator, + ) + latents = latents.to(self.device) + init_image = init_image.to(device=self.device, dtype=latents.dtype) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature. + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_kwargs = {} + if accepts_eta: + extra_kwargs["eta"] = eta + + for t in self.progress_bar(self.scheduler.timesteps): + # concat latents and low resolution image + latents_input = torch.cat([latents, init_image], dim=1) + # predict the noise residual + noise_pred = self.unet(latents_input, t).sample + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample + + # decode the image latents with the VQVAE + image = self.vqvae.decode(latents).sample + image = torch.clamp(image, -1.0, 1.0) + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) \ No newline at end of file From 2d3c98a78b57b1312556c2cd74745d76413b88b6 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Thu, 3 Nov 2022 17:26:40 +0700 Subject: [PATCH 02/33] style --- src/diffusers/__init__.py | 2 +- src/diffusers/pipelines/__init__.py | 2 +- .../pipeline_latent_diffusion_superresolution.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c8a090086459..a164ed5c0380 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -60,8 +60,8 @@ if is_torch_available() and is_transformers_available(): from .pipelines import ( - LDMTextToImagePipeline, LDMSuperResolutionPipeline, + LDMTextToImagePipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index a0033041008b..9d8a08e996a7 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -13,7 +13,7 @@ from ..utils.dummy_pt_objects import * # noqa F403 if is_torch_available() and is_transformers_available(): - from .latent_diffusion import LDMTextToImagePipeline, LDMSuperResolutionPipeline + from .latent_diffusion import LDMSuperResolutionPipeline, LDMTextToImagePipeline from .stable_diffusion import ( StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index cd117fa4559b..fee5ba371b92 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -140,4 +140,4 @@ def __call__( if not return_dict: return (image,) - return ImagePipelineOutput(images=image) \ No newline at end of file + return ImagePipelineOutput(images=image) From 0c44672bb712fc5152eac33fe8ce4b245ce9e182 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Thu, 3 Nov 2022 19:47:51 +0700 Subject: [PATCH 03/33] fix copies --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 615444425c44..66351bbf66f7 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ 
b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -4,6 +4,21 @@ from ..utils import DummyObject, requires_backends +class LDMSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 5519da275860c919acaff4a9762f8714000b3533 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Thu, 3 Nov 2022 19:50:03 +0700 Subject: [PATCH 04/33] style --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index fee5ba371b92..d082e70e06c9 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -1,5 +1,5 @@ import inspect -from typing import List, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np import torch From 8af0ade035f2eb11245af7f1e75cf7181a67726c Mon Sep 17 00:00:00 2001 From: duongna21 Date: Fri, 4 Nov 2022 06:15:26 +0700 Subject: [PATCH 05/33] fix doc --- .../pipeline_latent_diffusion_superresolution.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index d082e70e06c9..6f1fa7a2552d 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -64,13 +64,12 @@ def __call__( process. batch_size (`int`, *optional*, defaults to 1): Number of images to generate. - height (`int`, *optional*, defaults to 256): - The height in pixels of the generated image. - width (`int`, *optional*, defaults to 256): - The width in pixels of the generated image. - num_inference_steps (`int`, *optional*, defaults to 50): + num_inference_steps (`int`, *optional*, defaults to 100): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. generator (`torch.Generator`, *optional*): A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. From 82623e06f4753c57e46e5382f3cf78d8ff821682 Mon Sep 17 00:00:00 2001 From: "Duong A. 
Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Sat, 5 Nov 2022 10:24:22 +0700 Subject: [PATCH 06/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Patrick von Platen --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 6f1fa7a2552d..88cdc9f6815d 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -38,7 +38,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): def __init__( self, - vqvae: Union[VQModel, AutoencoderKL], + vqvae: VQModel, unet: Union[UNet2DModel, UNet2DConditionModel], scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], ): From 9ef5ba17befd3c176a9e2840d5c0a4007ea49fb9 Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Sat, 5 Nov 2022 10:27:25 +0700 Subject: [PATCH 07/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Patrick von Platen --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 88cdc9f6815d..7d8b9c4af0e1 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -39,7 +39,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): def __init__( self, vqvae: VQModel, - unet: Union[UNet2DModel, UNet2DConditionModel], + unet: UNet2DModel, scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], ): super().__init__() From 997763666b43d6b28502c92938bc226500a6a0ac Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Sat, 5 Nov 2022 10:27:37 +0700 Subject: [PATCH 08/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 7d8b9c4af0e1..205bfa21d2cc 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -29,7 +29,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): Parameters: vqvae ([`VQModel`]): - Vector-quantized (VQ) Model to encode and decode images to and from latent representations. + Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. 
Can be one of From 226bbc012583b25327063667af10b88f77abd5e9 Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Sat, 5 Nov 2022 10:28:44 +0700 Subject: [PATCH 09/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 205bfa21d2cc..7de810a60a75 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -120,7 +120,7 @@ def __call__( extra_kwargs["eta"] = eta for t in self.progress_bar(self.scheduler.timesteps): - # concat latents and low resolution image + # concat latents and low resolution image in the channel dimension. latents_input = torch.cat([latents, init_image], dim=1) # predict the noise residual noise_pred = self.unet(latents_input, t).sample From d52704f5efa40e55e52061117bc86402d1321cf6 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 13:54:47 +0700 Subject: [PATCH 10/33] add doc --- .../api/pipelines/ldm_superresolution.mdx | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 docs/source/api/pipelines/ldm_superresolution.mdx diff --git a/docs/source/api/pipelines/ldm_superresolution.mdx b/docs/source/api/pipelines/ldm_superresolution.mdx new file mode 100644 index 000000000000..2a3f3e4be97b --- /dev/null +++ b/docs/source/api/pipelines/ldm_superresolution.mdx @@ -0,0 +1,42 @@ + + +# Super Resolution using Latent Diffusion + +## Overview + +Super Resolution using Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. + +The abstract of the paper is the following: + +*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. 
Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.* + +The original codebase can be found [here](https://github.com/CompVis/latent-diffusion). + +## Tips: + +- +- +- + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_latent_diffusion_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py) | *Super Resolution* | - | + +## Examples: + + +## LDMSuperResolutionPipeline +[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion_superresolution.LDMSuperResolutionPipeline + - __call__ From 16584f78f8636bb1cc25f918152abef7bb752a7a Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 15:06:30 +0700 Subject: [PATCH 11/33] address comments --- .../community/composable_stable_diffusion.py | 2 +- examples/community/imagic_stable_diffusion.py | 2 +- .../community/interpolate_stable_diffusion.py | 2 +- examples/community/lpw_stable_diffusion.py | 2 +- .../community/seed_resize_stable_diffusion.py | 2 +- examples/community/stable_diffusion_mega.py | 2 +- .../community/wildcard_stable_diffusion.py | 2 +- src/diffusers/__init__.py | 2 +- .../pipelines/latent_diffusion/__init__.py | 2 +- .../pipeline_latent_diffusion.py | 2 +- ...peline_latent_diffusion_superresolution.py | 40 ++++++++++++++----- .../pipeline_latent_diffusion_uncond.py | 2 +- .../pipeline_flax_stable_diffusion.py | 2 +- .../pipeline_onnx_stable_diffusion_img2img.py | 2 +- .../pipeline_onnx_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 2 +- src/diffusers/utils/dummy_pt_objects.py | 15 +++++++ .../dummy_torch_and_transformers_objects.py | 30 -------------- 21 files changed, 63 insertions(+), 58 deletions(-) diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index 10d34d255a20..eb207e1bdd47 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -32,7 +32,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offsensive or harmful. 
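The documentation page added in PATCH 10 describes the new pipeline but leaves its `## Tips:` and `## Examples:` sections empty. A minimal usage sketch for `LDMSuperResolutionPipeline` follows; it is illustrative only and not part of the patch series. The checkpoint id and test image URL are taken from the integration test added in PATCH 15, while the 128x128 resize and the `eta=1.0` setting are arbitrary example values.

```python
# Illustrative usage sketch; not part of the patch series.
# The checkpoint id and image URL are the ones used by the integration test
# added in PATCH 15; the 128x128 resize and eta=1.0 are arbitrary choices.
from io import BytesIO

import requests
import torch
from PIL import Image

from diffusers import LDMSuperResolutionPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution")
pipeline = pipeline.to(device)

# fetch a low-resolution RGB image to upscale
url = (
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/vq_diffusion/teddy_bear_pool.png"
)
low_res = Image.open(BytesIO(requests.get(url).content)).convert("RGB").resize((128, 128))

# run the denoising loop and save the upscaled result
generator = torch.Generator(device=device).manual_seed(0)
upscaled = pipeline(low_res, num_inference_steps=100, eta=1.0, generator=generator).images[0]
upscaled.save("ldm_super_resolution.png")
```

With the default `output_type="pil"` the call returns a list of PIL images; for reference, the integration test in PATCH 15 expects a 1024x1024 output for the teddy-bear test image.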
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 92aa677b4626..0c95fb435848 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -54,7 +54,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offsensive or harmful. diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index bbb1b0f9e633..7dcd623ef0f8 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -65,7 +65,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 74aed2fec86f..ccebd90b2c8e 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -396,7 +396,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index f912663a687d..92cd1c04f9f3 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -37,7 +37,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index 723951941576..67112b282b67 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -42,7 +42,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionMegaSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 79cb4feb1e7c..5df43269ca1f 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -99,7 +99,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. 
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a164ed5c0380..8e53ba2005c9 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -35,6 +35,7 @@ DDPMPipeline, KarrasVePipeline, LDMPipeline, + LDMSuperResolutionPipeline, PNDMPipeline, ScoreSdeVePipeline, ) @@ -60,7 +61,6 @@ if is_torch_available() and is_transformers_available(): from .pipelines import ( - LDMSuperResolutionPipeline, LDMTextToImagePipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, diff --git a/src/diffusers/pipelines/latent_diffusion/__init__.py b/src/diffusers/pipelines/latent_diffusion/__init__.py index 3442222dfa77..5544527ff587 100644 --- a/src/diffusers/pipelines/latent_diffusion/__init__.py +++ b/src/diffusers/pipelines/latent_diffusion/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa from ...utils import is_transformers_available +from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline if is_transformers_available(): from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline - from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index d8948862845d..cfa39949130e 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -32,7 +32,7 @@ class LDMTextToImagePipeline(DiffusionPipeline): [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. """ diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 7de810a60a75..86706c1b99b5 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -7,9 +7,15 @@ import PIL -from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel +from ...models import UNet2DModel, VQModel from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput -from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...schedulers import ( + DDIMScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) def preprocess(image): @@ -30,17 +36,20 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): Parameters: vqvae ([`VQModel`]): Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations. - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], + [`EulerAncestralDiscreteScheduler`], or [`PNDMScheduler`]. """ def __init__( self, vqvae: VQModel, unet: UNet2DModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + scheduler: Union[ + DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler + ], ): super().__init__() self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) @@ -99,17 +108,27 @@ def __call__( height, width = init_image.shape[-2:] + latents_dtype = self.unet.dtype + # in_channels should be 6: 3 for latents, 3 for low resolution image latents = torch.randn( (batch_size, self.unet.in_channels // 2, height, width), - device="cuda", + device=self.device, generator=generator, + dtype=latents_dtype, ) latents = latents.to(self.device) - init_image = init_image.to(device=self.device, dtype=latents.dtype) + init_image = init_image.to(device=self.device, dtype=latents_dtype) # set timesteps self.scheduler.set_timesteps(num_inference_steps) + # Some schedulers like PNDM have timesteps as arrays + # It's more optimized to move all timesteps to correct device beforehand + timesteps_tensor = self.scheduler.timesteps.to(self.device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature. # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 @@ -119,9 +138,10 @@ def __call__( if accepts_eta: extra_kwargs["eta"] = eta - for t in self.progress_bar(self.scheduler.timesteps): + for t in self.progress_bar(timesteps_tensor): # concat latents and low resolution image in the channel dimension. latents_input = torch.cat([latents, init_image], dim=1) + latents_input = self.scheduler.scale_model_input(latents_input, t) # predict the noise residual noise_pred = self.unet(latents_input, t).sample # compute the previous noisy sample x_t -> x_t-1 @@ -130,7 +150,7 @@ def __call__( # decode the image latents with the VQVAE image = self.vqvae.decode(latents).sample image = torch.clamp(image, -1.0, 1.0) - image = (image / 2 + 0.5).clamp(0, 1) + image = image / 2 + 0.5 image = image.cpu().permute(0, 2, 3, 1).numpy() if output_type == "pil": diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index a7ffb4adc99a..c8da6f193e15 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -18,7 +18,7 @@ class LDMPipeline(DiffusionPipeline): Vector-quantized (VQ) Model to encode and decode images to and from latent representations. unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latens. + [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents. 
""" def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py index fe0e284c6720..b406fd914ae3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py @@ -42,7 +42,7 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], or [`FlaxPNDMScheduler`]. safety_checker ([`FlaxStableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 5e6b2e6f2fc3..04ecdbecc6d5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -46,7 +46,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py index 2ce9831a1676..517242921d92 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py @@ -59,7 +59,7 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 3c1eb734a49d..7a4642182a7f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -43,7 +43,7 @@ class StableDiffusionPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e61fb27acc1e..dfecfc6abf21 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -54,7 +54,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index bbe6ee60832c..3bc40cae9656 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -55,7 +55,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 6db3624177c6..a1cb42934c7d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -62,7 +62,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 5dd583279708..edf00ed30a66 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -212,6 +212,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class LDMSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class PNDMPipeline(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 66351bbf66f7..4fcb481e9e06 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -4,36 +4,6 @@ from ..utils import DummyObject, requires_backends -class LDMSuperResolutionPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - -class LDMTextToImagePipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class StableDiffusionImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 003e1859fa4e859000c7f7cd7c0d510061fd0a6d Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 15:16:51 +0700 Subject: [PATCH 12/33] address comments --- docs/source/api/pipelines/ldm_superresolution.mdx | 4 ++-- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 
deletions(-) diff --git a/docs/source/api/pipelines/ldm_superresolution.mdx b/docs/source/api/pipelines/ldm_superresolution.mdx index 2a3f3e4be97b..7f9b2adb5185 100644 --- a/docs/source/api/pipelines/ldm_superresolution.mdx +++ b/docs/source/api/pipelines/ldm_superresolution.mdx @@ -10,11 +10,11 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Super Resolution using Latent Diffusion +# Super Resolution with Latent Diffusion Model ## Overview -Super Resolution using Latent Diffusion was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. +Super Resolution with Latent Diffusion Model was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. The abstract of the paper is the following: diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 4fcb481e9e06..615444425c44 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -4,6 +4,21 @@ from ..utils import DummyObject, requires_backends +class LDMTextToImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From e360ce62acd16ed9c3d016ca1f102d5b92fbaf4c Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 15:33:24 +0700 Subject: [PATCH 13/33] fix doc --- .../source/api/pipelines/latent_diffusion.mdx | 5 +++ .../api/pipelines/ldm_superresolution.mdx | 42 ------------------- 2 files changed, 5 insertions(+), 42 deletions(-) delete mode 100644 docs/source/api/pipelines/ldm_superresolution.mdx diff --git a/docs/source/api/pipelines/latent_diffusion.mdx b/docs/source/api/pipelines/latent_diffusion.mdx index 6d63cd5cbe5b..4ade13e67b15 100644 --- a/docs/source/api/pipelines/latent_diffusion.mdx +++ b/docs/source/api/pipelines/latent_diffusion.mdx @@ -33,6 +33,7 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff | Pipeline | Tasks | Colab |---|---|:---:| | [pipeline_latent_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) | *Text-to-Image Generation* | - | +| [pipeline_latent_diffusion_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py) | *Super Resolution* | - | ## Examples: @@ -40,3 +41,7 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff ## LDMTextToImagePipeline [[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion.LDMTextToImagePipeline - __call__ + +## LDMSuperResolutionPipeline +[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion_superresolution.LDMSuperResolutionPipeline + 
- __call__ diff --git a/docs/source/api/pipelines/ldm_superresolution.mdx b/docs/source/api/pipelines/ldm_superresolution.mdx deleted file mode 100644 index 7f9b2adb5185..000000000000 --- a/docs/source/api/pipelines/ldm_superresolution.mdx +++ /dev/null @@ -1,42 +0,0 @@ - - -# Super Resolution with Latent Diffusion Model - -## Overview - -Super Resolution with Latent Diffusion Model was proposed in [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) by Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, Björn Ommer. - -The abstract of the paper is the following: - -*By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs.* - -The original codebase can be found [here](https://github.com/CompVis/latent-diffusion). 
- -## Tips: - -- -- -- - -## Available Pipelines: - -| Pipeline | Tasks | Colab -|---|---|:---:| -| [pipeline_latent_diffusion_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py) | *Super Resolution* | - | - -## Examples: - - -## LDMSuperResolutionPipeline -[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion_superresolution.LDMSuperResolutionPipeline - - __call__ From d189eeaa273ea77c6ebdbadb44e2b1c7d4568390 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 15:47:41 +0700 Subject: [PATCH 14/33] minor --- src/diffusers/pipelines/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9d8a08e996a7..e3fd167bec13 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -5,6 +5,7 @@ from .dance_diffusion import DanceDiffusionPipeline from .ddim import DDIMPipeline from .ddpm import DDPMPipeline + from .latent_diffusion import LDMSuperResolutionPipeline from .latent_diffusion_uncond import LDMPipeline from .pndm import PNDMPipeline from .score_sde_ve import ScoreSdeVePipeline @@ -13,7 +14,7 @@ from ..utils.dummy_pt_objects import * # noqa F403 if is_torch_available() and is_transformers_available(): - from .latent_diffusion import LDMSuperResolutionPipeline, LDMTextToImagePipeline + from .latent_diffusion import LDMTextToImagePipeline from .stable_diffusion import ( StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, From b2d5e21eb94a629c0d69193e0af127a6d4dea4a0 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 22:29:53 +0700 Subject: [PATCH 15/33] add tests --- .../test_latent_diffusion_superresolution.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py new file mode 100644 index 000000000000..0e98cee5e958 --- /dev/null +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -0,0 +1,117 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import torch +import random + +from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel +from diffusers.utils.testing_utils import require_torch +from diffusers.utils import floats_tensor, load_image, slow, torch_device + +from ...test_pipelines_common import PipelineTesterMixin + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class LDMSuperResolutionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + @property + def dummy_image(self): + batch_size = 1 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) + return image + + @property + def dummy_uncond_unet(self): + torch.manual_seed(0) + model = UNet2DModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=6, + out_channels=3, + down_block_types=("DownBlock2D", "AttnDownBlock2D"), + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) + return model + + @property + def dummy_vq_model(self): + torch.manual_seed(0) + model = VQModel( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=3, + ) + return model + + def test_inference_superresolution(self): + unet = self.dummy_uncond_unet + scheduler = DDIMScheduler() + vqvae = self.dummy_vq_model + + ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) + ldm.to(torch_device) + ldm.set_progress_bar_config(disable=None) + + init_image = self.dummy_image.to(torch_device) + + # Warmup pass when using mps (see #372) + if torch_device == "mps": + generator = torch.manual_seed(0) + _ = ldm(init_image, generator=generator, num_inference_steps=1, output_type="numpy").images + + generator = torch.manual_seed(0) + image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.8678, 0.8245, 0.6382, 0.6831, 0.4385, 0.56, 0.4641, 0.6202, 0.515]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch +class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase): + def test_inference_superresolution(self): + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/vq_diffusion/teddy_bear_pool.png" + ) + + ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto") + ldm.to(torch_device) + ldm.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=torch_device).manual_seed(0) + image = ldm( + init_image, generator=generator, num_inference_steps=20, output_type="numpy" + ).images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 1024, 1024, 3) + expected_slice = np.array([0.726, 0.7249, 0.7085, 0.774, 0.7419, 0.7188, 0.8359, 0.8031, 0.7158]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file From 4ca74e8e9b29af7cf2aab5a0995ec573ac14ae22 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 22:31:27 +0700 Subject: [PATCH 16/33] add tests --- .../test_latent_diffusion_superresolution.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py 
b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 0e98cee5e958..c5a0ff7d87f9 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -13,15 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random import unittest import numpy as np import torch -import random from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel -from diffusers.utils.testing_utils import require_torch from diffusers.utils import floats_tensor, load_image, slow, torch_device +from diffusers.utils.testing_utils import require_torch from ...test_pipelines_common import PipelineTesterMixin @@ -106,12 +106,10 @@ def test_inference_superresolution(self): ldm.set_progress_bar_config(disable=None) generator = torch.Generator(device=torch_device).manual_seed(0) - image = ldm( - init_image, generator=generator, num_inference_steps=20, output_type="numpy" - ).images + image = ldm(init_image, generator=generator, num_inference_steps=20, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 1024, 1024, 3) expected_slice = np.array([0.726, 0.7249, 0.7085, 0.774, 0.7419, 0.7188, 0.8359, 0.8031, 0.7158]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 \ No newline at end of file + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 69daedcf07f681be58dbb9fc2882b8e14cf3d4e8 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 22:58:11 +0700 Subject: [PATCH 17/33] load text encoder from subfolder --- .../pipeline_latent_diffusion_superresolution.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 86706c1b99b5..6608d5ce9637 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -57,7 +57,7 @@ def __init__( @torch.no_grad() def __call__( self, - init_image: Union[torch.FloatTensor, PIL.Image.Image], + init_image: Union[torch.Tensor, PIL.Image.Image], batch_size: Optional[int] = 1, num_inference_steps: Optional[int] = 100, eta: Optional[float] = 0.0, @@ -68,7 +68,7 @@ def __call__( ) -> Union[Tuple, ImagePipelineOutput]: r""" Args: - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + init_image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. 
batch_size (`int`, *optional*, defaults to 1): @@ -96,11 +96,11 @@ def __call__( if isinstance(init_image, PIL.Image.Image): batch_size = 1 - elif isinstance(init_image, torch.FloatTensor): + elif isinstance(init_image, torch.Tensor): batch_size = init_image.shape[0] else: raise ValueError( - f"`init_image` has to be of type `PIL.Image.Image` or `torch.FloatTensor` but is {type(init_image)}" + f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}" ) if isinstance(init_image, PIL.Image.Image): From ac78735401dd7172ca3235b1b7230af8459b251b Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sat, 5 Nov 2022 23:35:50 +0700 Subject: [PATCH 18/33] fix test --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index c5a0ff7d87f9..f2ec66bd76a3 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -88,7 +88,7 @@ def test_inference_superresolution(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8678, 0.8245, 0.6382, 0.6831, 0.4385, 0.56, 0.4641, 0.6202, 0.515]) + expected_slice = np.array([0.8559, 0.823, 0.6421, 0.6939, 0.4467, 0.5723, 0.4763, 0.6302, 0.5196]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 9c5134c609a0f67f1a77171aa4f4d35cdf9aa083 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sun, 6 Nov 2022 09:45:06 +0700 Subject: [PATCH 19/33] fix test --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index f2ec66bd76a3..b01592a252ab 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -86,9 +86,10 @@ def test_inference_superresolution(self): image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] + print(f'image_slice: {image_slice}') assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8559, 0.823, 0.6421, 0.6939, 0.4467, 0.5723, 0.4763, 0.6302, 0.5196]) + expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 711555725f07e1839890f4585694b346049c37ca Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sun, 6 Nov 2022 09:46:59 +0700 Subject: [PATCH 20/33] style --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index b01592a252ab..7c3f8cdddfd2 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -86,7 +86,7 @@ def test_inference_superresolution(self): image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images 
image_slice = image[0, -3:, -3:, -1] - print(f'image_slice: {image_slice}') + print(f"image_slice: {image_slice}") assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) From afc44621c45c9188b91e15bf4ace051895d2c043 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sun, 6 Nov 2022 09:47:10 +0700 Subject: [PATCH 21/33] style --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 7c3f8cdddfd2..b2e6daefad57 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -86,7 +86,6 @@ def test_inference_superresolution(self): image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] - print(f"image_slice: {image_slice}") assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) From 5708a2c0940dac432069ce483559bb8877065163 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Sun, 6 Nov 2022 17:32:52 +0700 Subject: [PATCH 22/33] handle mps latents --- ...peline_latent_diffusion_superresolution.py | 19 +++++++++++-------- .../test_latent_diffusion_superresolution.py | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 6608d5ce9637..00b9c97f0fa3 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -108,15 +108,18 @@ def __call__( height, width = init_image.shape[-2:] - latents_dtype = self.unet.dtype # in_channels should be 6: 3 for latents, 3 for low resolution image - latents = torch.randn( - (batch_size, self.unet.in_channels // 2, height, width), - device=self.device, - generator=generator, - dtype=latents_dtype, - ) - latents = latents.to(self.device) + latents_shape = (batch_size, self.unet.in_channels // 2, height, width) + latents_dtype = self.unet.dtype + + if self.device.type == "mps": + # randn does not work reproducibly on mps + latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to( + self.device + ) + else: + latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) + init_image = init_image.to(device=self.device, dtype=latents_dtype) # set timesteps diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index b2e6daefad57..139dac086bbd 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -88,7 +88,7 @@ def test_inference_superresolution(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) + expected_slice = np.array([0.8534, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176]) 
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From b4fbb2b99455a0b9bfd6538133de30454b135b1a Mon Sep 17 00:00:00 2001 From: duongna21 Date: Mon, 7 Nov 2022 18:37:12 +0700 Subject: [PATCH 23/33] unfix typo --- examples/community/composable_stable_diffusion.py | 4 ++-- examples/community/imagic_stable_diffusion.py | 4 ++-- examples/community/interpolate_stable_diffusion.py | 4 ++-- examples/community/lpw_stable_diffusion.py | 4 ++-- examples/community/seed_resize_stable_diffusion.py | 4 ++-- examples/community/stable_diffusion_mega.py | 4 ++-- examples/community/wildcard_stable_diffusion.py | 4 ++-- .../pipelines/latent_diffusion/pipeline_latent_diffusion.py | 4 ++-- .../pipeline_latent_diffusion_superresolution.py | 2 +- .../pipeline_latent_diffusion_uncond.py | 4 ++-- src/diffusers/pipelines/pndm/pipeline_pndm.py | 2 +- .../stable_diffusion/pipeline_flax_stable_diffusion.py | 4 ++-- .../pipeline_onnx_stable_diffusion_img2img.py | 4 ++-- .../pipeline_onnx_stable_diffusion_inpaint.py | 4 ++-- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 ++-- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 4 ++-- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 4 ++-- .../pipeline_stable_diffusion_inpaint_legacy.py | 4 ++-- 18 files changed, 34 insertions(+), 34 deletions(-) diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index eb207e1bdd47..e92c8c8d870f 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -30,9 +30,9 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offsensive or harmful. diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 0c95fb435848..d6b3eb150d5e 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -52,9 +52,9 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offsensive or harmful. diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index 7dcd623ef0f8..f0ee046abd8b 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -63,9 +63,9 @@ class StableDiffusionWalkPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index ccebd90b2c8e..d117322405e5 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -394,9 +394,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index 92cd1c04f9f3..3cb59d900cf3 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -35,9 +35,9 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index 67112b282b67..4a97389a0b3d 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -40,9 +40,9 @@ class StableDiffusionMegaPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionMegaSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 5df43269ca1f..48093b709d86 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -97,9 +97,9 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index cfa39949130e..d7512310431d 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -30,9 +30,9 @@ class LDMTextToImagePipeline(DiffusionPipeline): tokenizer (`transformers.BertTokenizer`): Tokenizer of class [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 00b9c97f0fa3..5058fa27d092 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -38,7 +38,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations. unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], or [`PNDMScheduler`]. """ diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index c8da6f193e15..eb5d89be1e1c 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -16,9 +16,9 @@ class LDMPipeline(DiffusionPipeline): Parameters: vqvae ([`VQModel`]): Vector-quantized (VQ) Model to encode and decode images to and from latent representations. - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents. + [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latens. """ def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py index f360da09ac94..6472af1211c4 100644 --- a/src/diffusers/pipelines/pndm/pipeline_pndm.py +++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py @@ -29,7 +29,7 @@ class PNDMPipeline(DiffusionPipeline): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Parameters: - unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents. + unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image. """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py index b406fd914ae3..500196953062 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py @@ -40,9 +40,9 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 
+ unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], or [`FlaxPNDMScheduler`]. safety_checker ([`FlaxStableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 04ecdbecc6d5..927e99bd6952 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -44,9 +44,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py index 517242921d92..3f20219e3c7d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py @@ -57,9 +57,9 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 7a4642182a7f..57c7dc2e3ec9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -41,9 +41,9 @@ class StableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index dfecfc6abf21..e62a7bbfb799 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -52,9 +52,9 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 3bc40cae9656..7abbf13ed7fc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -53,9 +53,9 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. 
Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index a1cb42934c7d..838da6c410b3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -60,9 +60,9 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. From f02b34be30c423fb991eac935632e106ee9e8576 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Mon, 7 Nov 2022 18:40:13 +0700 Subject: [PATCH 24/33] unfix typo --- examples/community/composable_stable_diffusion.py | 2 +- examples/community/imagic_stable_diffusion.py | 2 +- examples/community/interpolate_stable_diffusion.py | 2 +- examples/community/lpw_stable_diffusion.py | 2 +- examples/community/seed_resize_stable_diffusion.py | 2 +- examples/community/stable_diffusion_mega.py | 2 +- examples/community/wildcard_stable_diffusion.py | 2 +- .../pipelines/latent_diffusion/pipeline_latent_diffusion.py | 2 +- .../latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py | 2 +- src/diffusers/pipelines/pndm/pipeline_pndm.py | 2 +- .../stable_diffusion/pipeline_flax_stable_diffusion.py | 2 +- .../stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py | 2 +- .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 2 +- .../pipeline_stable_diffusion_inpaint_legacy.py | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index e92c8c8d870f..10d34d255a20 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -30,7 +30,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. 
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index d6b3eb150d5e..92aa677b4626 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -52,7 +52,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index f0ee046abd8b..bbb1b0f9e633 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -63,7 +63,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index d117322405e5..74aed2fec86f 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -394,7 +394,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index 3cb59d900cf3..f912663a687d 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -35,7 +35,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
- unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index 4a97389a0b3d..723951941576 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -40,7 +40,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index 48093b709d86..79cb4feb1e7c 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -97,7 +97,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index d7512310431d..d8948862845d 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -30,7 +30,7 @@ class LDMTextToImagePipeline(DiffusionPipeline): tokenizer (`transformers.BertTokenizer`): Tokenizer of class [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index eb5d89be1e1c..a7ffb4adc99a 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -16,7 +16,7 @@ class LDMPipeline(DiffusionPipeline): Parameters: vqvae ([`VQModel`]): Vector-quantized (VQ) Model to encode and decode images to and from latent representations. - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latens. """ diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py index 6472af1211c4..f360da09ac94 100644 --- a/src/diffusers/pipelines/pndm/pipeline_pndm.py +++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py @@ -29,7 +29,7 @@ class PNDMPipeline(DiffusionPipeline): library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Parameters: - unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latens. + unet (`UNet2DModel`): U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): The `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image. """ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py index 500196953062..fe0e284c6720 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py @@ -40,7 +40,7 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], or [`FlaxPNDMScheduler`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 927e99bd6952..5e6b2e6f2fc3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -44,7 +44,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 
scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py index 3f20219e3c7d..2ce9831a1676 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py @@ -57,7 +57,7 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 57c7dc2e3ec9..3c1eb734a49d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -41,7 +41,7 @@ class StableDiffusionPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e62a7bbfb799..e61fb27acc1e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -52,7 +52,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 7abbf13ed7fc..bbe6ee60832c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -53,7 +53,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 838da6c410b3..6db3624177c6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -60,7 +60,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline): tokenizer (`CLIPTokenizer`): Tokenizer of class [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latens. + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. From 9606d01502680227ab5764b3ec33ef75d7f38d57 Mon Sep 17 00:00:00 2001 From: "Duong A. 
Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Tue, 8 Nov 2022 15:32:44 +0700 Subject: [PATCH 25/33] Update tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py Co-authored-by: Pedro Cuenca --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 139dac086bbd..c6d0715f4b82 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -89,7 +89,8 @@ def test_inference_superresolution(self): assert image.shape == (1, 64, 64, 3) expected_slice = np.array([0.8534, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + tolerance = 1e-2 if torch_device != "mps" else 3e-2 + assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance @slow From dc7de8041cc42e4d6a39035d8583af1c233ce533 Mon Sep 17 00:00:00 2001 From: duongna21 Date: Tue, 8 Nov 2022 15:36:01 +0700 Subject: [PATCH 26/33] fix set_timesteps mps --- .../pipeline_latent_diffusion_superresolution.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 5058fa27d092..04763fc6e6b0 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -122,12 +122,9 @@ def __call__( init_image = init_image.to(device=self.device, dtype=latents_dtype) - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - # Some schedulers like PNDM have timesteps as arrays - # It's more optimized to move all timesteps to correct device beforehand - timesteps_tensor = self.scheduler.timesteps.to(self.device) + # set timesteps and move to the correct device + self.scheduler.set_timesteps(num_inference_steps, device=self.device) + timesteps_tensor = self.scheduler.timesteps # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma From 6f985433c19bf74bfcefb0f80d14975995b7aa0a Mon Sep 17 00:00:00 2001 From: duongna21 Date: Tue, 8 Nov 2022 15:38:00 +0700 Subject: [PATCH 27/33] fix set_timesteps mps --- .../latent_diffusion/test_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index c6d0715f4b82..8ba05d61913f 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -88,7 +88,7 @@ def test_inference_superresolution(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8534, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176]) + expected_slice = np.array([0.8634, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176]) tolerance = 1e-2 if torch_device != "mps" else 3e-2 assert np.abs(image_slice.flatten() - 
expected_slice).max() < tolerance From 3f6e1fa6124863e0812a1c9266648549afcd12b2 Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Tue, 8 Nov 2022 18:42:27 +0700 Subject: [PATCH 28/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 04763fc6e6b0..936efb7c4181 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -48,7 +48,7 @@ def __init__( vqvae: VQModel, unet: UNet2DModel, scheduler: Union[ - DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler + DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler ], ): super().__init__() From ef0c0916399ccc44e9f57346f4ab41fd685fc4fb Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Tue, 8 Nov 2022 18:43:21 +0700 Subject: [PATCH 29/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 936efb7c4181..146e010e6af8 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -110,7 +110,7 @@ def __call__( # in_channels should be 6: 3 for latents, 3 for low resolution image latents_shape = (batch_size, self.unet.in_channels // 2, height, width) - latents_dtype = self.unet.dtype + latents_dtype = next(self.unet.parameters()).dtype if self.device.type == "mps": # randn does not work reproducibly on mps From 47593f29317b94bb0976e2c9f56e82ead0522cb3 Mon Sep 17 00:00:00 2001 From: "Duong A. 
Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Tue, 8 Nov 2022 18:44:17 +0700 Subject: [PATCH 30/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 146e010e6af8..4b6f77657a24 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -114,9 +114,8 @@ def __call__( if self.device.type == "mps": # randn does not work reproducibly on mps - latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to( - self.device - ) + latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype) + latents = latents.to(self.device) else: latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) From 1f808a1fffb2ed7909d3f56fc48cdc850644a6bb Mon Sep 17 00:00:00 2001 From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com> Date: Tue, 8 Nov 2022 18:44:31 +0700 Subject: [PATCH 31/33] Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py Co-authored-by: Suraj Patil --- .../pipeline_latent_diffusion_superresolution.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 4b6f77657a24..ff8b8b29ff2c 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -30,7 +30,9 @@ def preprocess(image): class LDMSuperResolutionPipeline(DiffusionPipeline): r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + A pipeline for image super-resolution using Latent + + This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 

     Parameters:

From 5308ff58d25b40af9d9ea0fb898bfe925ee61312 Mon Sep 17 00:00:00 2001
From: duongna21
Date: Tue, 8 Nov 2022 18:50:54 +0700
Subject: [PATCH 32/33] style

---
 .../pipeline_latent_diffusion_superresolution.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
index ff8b8b29ff2c..044ff359e317 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -11,6 +11,7 @@
 from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import (
     DDIMScheduler,
+    DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
     LMSDiscreteScheduler,
@@ -30,7 +31,7 @@ def preprocess(image):

 class LDMSuperResolutionPipeline(DiffusionPipeline):
     r"""
-    A pipeline for image super-resolution using Latent
+    A pipeline for image super-resolution using Latent

     This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
@@ -42,7 +43,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
-            [`EulerAncestralDiscreteScheduler`], or [`PNDMScheduler`].
+            [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`].
     """

     def __init__(
@@ -50,7 +51,12 @@ def __init__(
         vqvae: VQModel,
         unet: UNet2DModel,
         scheduler: Union[
-            DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
         ],
     ):
         super().__init__()
@@ -117,7 +123,7 @@ def __call__(
         if self.device.type == "mps":
             # randn does not work reproducibly on mps
             latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype)
-            latents = latents.to(self.device)
+            latents = latents.to(self.device)
         else:
             latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)

From 6f122a7e7bb41437c76a8b58b5e43950fef865d6 Mon Sep 17 00:00:00 2001
From: duongna21
Date: Tue, 8 Nov 2022 20:46:39 +0700
Subject: [PATCH 33/33] test 64x64 instead of 256x256

---
 .../test_latent_diffusion_superresolution.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index 8ba05d61913f..f5ec56d1bdc2 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -19,6 +19,7 @@
 import numpy as np
 import torch

+import PIL
 from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
 from diffusers.utils.testing_utils import require_torch
@@ -101,6 +102,7 @@ def test_inference_superresolution(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/vq_diffusion/teddy_bear_pool.png"
         )
+        init_image = init_image.resize((64, 64), resample=PIL.Image.LANCZOS)

         ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto")
         ldm.to(torch_device)

         image_slice = image[0, -3:, -3:, -1]

-        assert image.shape == (1, 1024, 1024, 3)
-        expected_slice = np.array([0.726, 0.7249, 0.7085, 0.774, 0.7419, 0.7188, 0.8359, 0.8031, 0.7158])
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.7418, 0.7472, 0.7424, 0.7422, 0.7463, 0.726, 0.7382, 0.7248, 0.6828])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2