From 23db15eb8567fcb79f2615efa5e022629199221b Mon Sep 17 00:00:00 2001
From: Sandeep Kamath
Date: Thu, 23 Mar 2023 19:49:57 +0530
Subject: [PATCH 1/4] Remove suggestion to use cuDNN benchmark in docs

---
 docs/source/en/optimization/fp16.mdx | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/docs/source/en/optimization/fp16.mdx b/docs/source/en/optimization/fp16.mdx
index c18cefbde6a9..94dda03e15e5 100644
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -20,7 +20,6 @@ We'll discuss how the following settings impact performance and memory.
 | ---------------- | ------- | ------- |
 | original | 9.50s | x1 |
 | cuDNN auto-tuner | 9.37s | x1.01 |
-| fp16 | 3.61s | x2.63 |
 | channels last | 3.30s | x2.88 |
 | traced UNet | 3.21s | x2.96 |
 | memory efficient attention | 2.63s | x3.61 |
@@ -31,18 +30,6 @@ We'll discuss how the following settings impact performance and memory.
 steps.
 
 
-## Enable cuDNN auto-tuner
-
-[NVIDIA cuDNN](https://developer.nvidia.com/cudnn) supports many algorithms to compute a convolution. Autotuner runs a short benchmark and selects the kernel with the best performance on a given hardware for a given input size.
-
-Since we’re using **convolutional networks** (other types currently not supported), we can enable cuDNN autotuner before launching the inference by setting:
-
-```python
-import torch
-
-torch.backends.cudnn.benchmark = True
-```
-
 ### Use tf32 instead of fp32 (on Ampere and later CUDA devices)
 
 On Ampere and later CUDA devices matrix multiplications and convolutions can use the TensorFloat32 (TF32) mode for faster but slightly less accurate computations. By default PyTorch enables TF32 mode for convolutions but not matrix multiplications, and unless a network requires full float32 precision we recommend enabling this setting for matrix multiplications, too. It can significantly speed up computations with typically negligible loss of numerical accuracy. You can read more about it [here](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32). All you need to do is to add this before your inference:

From 75f47e224b6c142f114ad16f369931e4c9d69b8f Mon Sep 17 00:00:00 2001
From: Sandeep Kamath
Date: Thu, 23 Mar 2023 20:01:06 +0530
Subject: [PATCH 2/4] removing the wrong line

---
 docs/source/en/optimization/fp16.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/optimization/fp16.mdx b/docs/source/en/optimization/fp16.mdx
index 94dda03e15e5..9d7c0234cdda 100644
--- a/docs/source/en/optimization/fp16.mdx
+++ b/docs/source/en/optimization/fp16.mdx
@@ -19,7 +19,7 @@ We'll discuss how the following settings impact performance and memory.
 | | Latency | Speedup |
 | ---------------- | ------- | ------- |
 | original | 9.50s | x1 |
-| cuDNN auto-tuner | 9.37s | x1.01 |
+| fp16 | 3.61s | x2.63 |
 | channels last | 3.30s | x2.88 |
 | traced UNet | 3.21s | x2.96 |
 | memory efficient attention | 2.63s | x3.61 |

From 461b82096c1c493028d5fd2bd3ed17b6881ddc46 Mon Sep 17 00:00:00 2001
From: Sandeep Kamath
Date: Wed, 29 Mar 2023 19:59:45 +0530
Subject: [PATCH 3/4] add support for embeds

---
 .../pipeline_stable_diffusion_upscale.py | 54 +++++++++++++++++--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 9f8f44a12bb4..6b6d7475057f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -317,10 +317,50 @@ def decode_latents(self, latents):
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
-    def check_inputs(self, prompt, image, noise_level, callback_steps):
-        if not isinstance(prompt, str) and not isinstance(prompt, list):
+    def check_inputs(
+        self,
+        prompt,
+        image,
+        noise_level,
+        callback_steps,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
         if (
             not isinstance(image, torch.Tensor)
             and not isinstance(image, PIL.Image.Image)
@@ -480,13 +520,19 @@ def __call__(
         """
 
         # 1. Check inputs
-        self.check_inputs(prompt, image, noise_level, callback_steps)
+        self.check_inputs(prompt, image, noise_level, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
 
         if image is None:
             raise ValueError("`image` input cannot be undefined.")
 
         # 2. Define call parameters
-        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
         device = self._execution_device
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`

From 656f76ced2154a5be73b161007f6caa7b350cca1 Mon Sep 17 00:00:00 2001
From: Sandeep Kamath
Date: Thu, 30 Mar 2023 00:04:46 +0530
Subject: [PATCH 4/4] fix line length

---
 .../pipeline_stable_diffusion_upscale.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 6b6d7475057f..9f55d9cc188a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -520,7 +520,15 @@ def __call__(
         """
 
         # 1. Check inputs
-        self.check_inputs(prompt, image, noise_level, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+        self.check_inputs(
+            prompt,
+            image,
+            noise_level,
+            callback_steps,
+            negative_prompt,
+            prompt_embeds,
+            negative_prompt_embeds,
+        )
 
         if image is None:
             raise ValueError("`image` input cannot be undefined.")
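
Two brief usage notes on the series above.

The TF32 paragraph kept as context in PATCH 1/4 ends at "All you need to do is to add this before your inference:", and the snippet it introduces falls outside the visible hunk context. For reference, a minimal sketch of the PyTorch switch that paragraph describes (on Ampere and later GPUs, convolutions already default to TF32; matrix multiplications have to be opted in):

```python
import torch

# Opt matrix multiplications into TF32 on Ampere and later GPUs.
# Convolutions already use TF32 by default, as the paragraph above notes.
torch.backends.cuda.matmul.allow_tf32 = True
```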
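
PATCH 3/4 and PATCH 4/4 extend `StableDiffusionUpscalePipeline` so that `check_inputs` accepts precomputed `prompt_embeds`/`negative_prompt_embeds`, rejects passing both a `prompt` and `prompt_embeds` (or neither), requires matching embedding shapes, and derives `batch_size` from `prompt_embeds` when no `prompt` is given. A usage sketch under stated assumptions: the checkpoint is the standard x4 upscaler, the input file name is hypothetical, and the embeddings are computed by hand with the pipeline's own tokenizer and text encoder:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionUpscalePipeline

# Standard x4 upscaler checkpoint for this pipeline.
pipe = StableDiffusionUpscalePipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
).to("cuda")

low_res_img = Image.open("low_res_cat.png").convert("RGB")  # hypothetical input image

# Precompute prompt embeddings with the pipeline's own tokenizer/text encoder.
text_inputs = pipe.tokenizer(
    "a white cat",
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
prompt_embeds = pipe.text_encoder(text_inputs.input_ids.to("cuda"))[0]

# With these patches, `prompt` may be omitted when `prompt_embeds` is given;
# passing both, or neither, raises a ValueError from check_inputs.
upscaled = pipe(image=low_res_img, prompt_embeds=prompt_embeds).images[0]
upscaled.save("upscaled_cat.png")
```

If `negative_prompt_embeds` is also passed, it must have the same shape as `prompt_embeds`, per the new shape check in PATCH 3/4.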