From 989979752a1c9e6e16e92d8f1b3d14b70829fbb5 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Tue, 8 Aug 2023 17:20:21 -0700 Subject: [PATCH 01/10] Initial code to add force_unmasked_unchanged argument to StableDiffusionInpaintPipeline.__call__. --- .../pipeline_stable_diffusion_inpaint.py | 49 ++++++++++++++----- .../test_stable_diffusion_inpaint.py | 44 +++++++++++++++++ 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index ab2abdc05f51..38c2e79d7d32 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -585,13 +585,14 @@ def prepare_latents( "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." "However, either the image or the noise timestep has not been provided." ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) if return_image_latents or (latents is None and not is_strength_max): image = image.to(device=device, dtype=dtype) image_latents = self._encode_vae_image(image=image, generator=generator) if latents is None: - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # if strength is 1. then initialise the latents to noise, else initial to image + noise latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma @@ -698,6 +699,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + force_unmasked_unchanged: Optional[bool] = None, ): r""" The call function to the pipeline for generation. 
@@ -764,6 +766,12 @@ def __call__( cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + force_unmasked_unchanged (`bool`, *optional*): + Whether to force the unmasked areas of `image` to remain exactly the same after inpainting for a model + with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced + to remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an + error. + Examples: @@ -873,7 +881,21 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels num_channels_unet = self.unet.config.in_channels - return_image_latents = num_channels_unet == 4 + + # return_image_latents = num_channels_unet == 4 or force_unmasked_unchanged + if num_channels_unet == 4 and force_unmasked_unchanged is False: + raise ValueError( + "Cannot set `force_unmasked_unchanged=False` for inpainting if the UNet has only 4 input channels." + " Either set `force_unmasked_unchanged` to `True` or use a UNet checkpoint that has been trained" + " specifically for inpainting." 
+ ) + elif num_channels_unet == 4 and force_unmasked_unchanged is None: + # For checkpoints not trained for inpainting the unmasked area should by default not be changed + force_unmasked_unchanged = True + elif num_channels_unet > 4 and force_unmasked_unchanged is None: + # For checkpoints trained for inpainting the unmasked area should by default be allowed to change so that + # the model can make the transition between the inpainted and non-inpainted area more natural + force_unmasked_unchanged = False latents_outputs = self.prepare_latents( batch_size * num_images_per_prompt, @@ -888,10 +910,10 @@ def __call__( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=return_image_latents, + return_image_latents=force_unmasked_unchanged, ) - if return_image_latents: + if force_unmasked_unchanged: latents, noise, image_latents = latents_outputs else: latents, noise = latents_outputs @@ -960,17 +982,18 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if num_channels_unet == 4: - init_latents_proper = image_latents[:1] - init_mask = mask[:1] + if num_channels_unet == 4 and i < len(timesteps) - 1: + # add noise for next timestep + noise_timestep = timesteps[i + 1] - if i < len(timesteps) - 1: - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - init_latents_proper, noise, torch.tensor([noise_timestep]) - ) + init_latents_proper = self.scheduler.add_noise( + image_latents[:1], noise, torch.tensor([noise_timestep]) + ) - latents = (1 - init_mask) * init_latents_proper + init_mask * latents + latents = (1 - mask[:1]) * init_latents_proper + mask[:1] * latents + + if force_unmasked_unchanged and i == (len(timesteps) - 1): + latents = (1 - mask[:1]) * image_latents[:1] + mask[:1] * latents # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps 
and (i + 1) % self.scheduler.order == 0): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 16fff0c13dd2..75a0212474ba 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -251,6 +251,36 @@ def test_stable_diffusion_inpaint_strength_zero_test(self): inputs["strength"] = 0.01 with self.assertRaises(ValueError): sd_pipe(**inputs).images + + def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionInpaintPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + # Get 32 x 32 image manually + image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + original_image_slice = image[-3:, -3:, :].numpy() + init_image = Image.fromarray(np.uint8(255 * image)).convert("RGB") + + # mask_array = image + 4 + mask_array = image + 250 + # # Make some pixels unmasked + # mask_array[-3:, -3:, :] = 255 + mask_array[-3:, -3:, :] = 4 + mask_image = Image.fromarray(np.uint8(mask_array)).convert("RGB") + + inputs = self.get_dummy_inputs(device) + inputs["image"] = init_image + inputs["mask_image"] = mask_image + inputs["force_unmasked_unchanged"] = True + + image = sd_pipe(**inputs).images + + output_image_slice = image[0, -3:, -3:, :] + assert np.abs(original_image_slice.flatten() - output_image_slice.flatten()).max() < 1e-3 class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): @@ -323,6 +353,20 @@ def test_stable_diffusion_inpaint(self): expected_slice = np.array([0.4925, 0.4967, 0.4100, 0.5234, 0.5322, 0.4532, 0.5805, 0.5877, 0.4151]) assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-2 + + def test_stable_diffusion_inpaint_force_unmasked_unchanged_false(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionInpaintPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + + # check that the pipeline raises value error when num_unet_channels == 4 and force_masked_unchanged == False + inputs["force_unmasked_unchanged"] = False + with self.assertRaises(ValueError): + sd_pipe(**inputs).images @unittest.skip("skipped here because area stays unchanged due to mask") def test_stable_diffusion_inpaint_lora(self): From ff41cf43c5a820057289e0852fb41b3e2074d86d Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Thu, 17 Aug 2023 23:37:48 -0700 Subject: [PATCH 02/10] Try to improve StableDiffusionInpaintPipelineFastTests.get_dummy_inputs. --- .../test_stable_diffusion_inpaint.py | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 75a0212474ba..616471b2c4fe 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -144,16 +144,31 @@ def get_dummy_components(self): } return components - def get_dummy_inputs(self, device, seed=0): + def get_dummy_inputs(self, device, seed=0, img_res=64, output_pil=True): # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) + 
if output_pil: + # Get random floats in [0, 1] as image + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + mask_image = torch.ones_like(image) + # Convert image and mask_image to [0, 255] + image = 255 * image + mask_image = 255 * mask_image + # Convert to PIL image + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((img_res, img_res)) + mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB").resize((img_res, img_res)) + else: + # Get random floats in [0, 1] as image with spatial size (img_res, img_res) + image = floats_tensor((1, 3, img_res, img_res), rng=random.Random(seed)).to(device) + # Convert image to [-1, 1] + init_image = 2.0 * image - 1.0 + mask_image = torch.ones((1, 1, img_res, img_res), device=device) + if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) + inputs = { "prompt": "A painting of a squirrel eating a burger", "image": init_image, @@ -259,28 +274,24 @@ def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) - # Get 32 x 32 image manually - image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - original_image_slice = image[-3:, -3:, :].numpy() - init_image = Image.fromarray(np.uint8(255 * image)).convert("RGB") - - # mask_array = image + 4 - mask_array = image + 250 - # # Make some pixels unmasked - # mask_array[-3:, -3:, :] = 255 - mask_array[-3:, -3:, :] = 4 - mask_image = Image.fromarray(np.uint8(mask_array)).convert("RGB") - - inputs = self.get_dummy_inputs(device) - inputs["image"] = init_image + inputs = self.get_dummy_inputs(device, output_pil=False) + init_image = inputs["image"].detach().clone() + mask_image = inputs["mask_image"].detach().clone() + # Map init_image to [0, 1] + init_image = (init_image / 2 
+ 0.5).clamp(0, 1) + # Get image slice + original_image_slice = init_image[0, -1, -3:, -3:].flatten().numpy() + # Unmask the same slice in mask_image + mask_image[0, 0, -3:, -3:] = 0 + inputs["mask_image"] = mask_image inputs["force_unmasked_unchanged"] = True + inputs["output_type"] = "pt" image = sd_pipe(**inputs).images + output_image_slice = image[0, -1, -3:, -3:].flatten().numpy() - output_image_slice = image[0, -3:, -3:, :] - assert np.abs(original_image_slice.flatten() - output_image_slice.flatten()).max() < 1e-3 + assert np.abs(original_image_slice - output_image_slice).max() < 1e-3 class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): From 092bd0e9e9628f02ff917aafb2a5c9252776b6e9 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Mon, 21 Aug 2023 22:13:59 -0700 Subject: [PATCH 03/10] Use original mask to preserve unmasked pixels in pixel space rather than latent space. --- .../pipeline_stable_diffusion_inpaint.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 38c2e79d7d32..2a1815cf6f8e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -982,18 +982,18 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if num_channels_unet == 4 and i < len(timesteps) - 1: - # add noise for next timestep - noise_timestep = timesteps[i + 1] + if num_channels_unet == 4: + init_latents_proper = image_latents[:1] + init_mask = mask[:1] - init_latents_proper = self.scheduler.add_noise( - image_latents[:1], noise, torch.tensor([noise_timestep]) - ) + if i < len(timesteps) - 1: + # add noise for next timestep 
+ noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) - latents = (1 - mask[:1]) * init_latents_proper + mask[:1] * latents - - if force_unmasked_unchanged and i == (len(timesteps) - 1): - latents = (1 - mask[:1]) * image_latents[:1] + mask[:1] * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): @@ -1010,6 +1010,11 @@ def __call__( mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) condition_kwargs = {"image": init_image_condition, "mask": mask_condition} image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0] + # If force_unmasked_unchanged, use the original mask in pixel space to recover the original pixels + if force_unmasked_unchanged: + # Make sure image is on CPU + image = image.cpu() + image = (1 - mask_condition[:1]) * init_image[:1] + mask_condition[:1] * image image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: image = latents From fa7759293af916c665f683387a52fd789ed10531 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Mon, 21 Aug 2023 22:19:46 -0700 Subject: [PATCH 04/10] make style --- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 7 +++---- .../stable_diffusion/test_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 2a1815cf6f8e..0c6fb0729e7c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -585,7 +585,7 @@ def prepare_latents( 
"Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." "However, either the image or the noise timestep has not been provided." ) - + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) if return_image_latents or (latents is None and not is_strength_max): @@ -768,9 +768,8 @@ def __call__( [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). force_unmasked_unchanged (`bool`, *optional*): Whether to force the unmasked areas of `image` to remain exactly the same after inpainting for a model - with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced - to remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an - error. + with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced to + remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an error. 
Examples: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 616471b2c4fe..e2ab32e9cf45 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -266,7 +266,7 @@ def test_stable_diffusion_inpaint_strength_zero_test(self): inputs["strength"] = 0.01 with self.assertRaises(ValueError): sd_pipe(**inputs).images - + def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() @@ -364,7 +364,7 @@ def test_stable_diffusion_inpaint(self): expected_slice = np.array([0.4925, 0.4967, 0.4100, 0.5234, 0.5322, 0.4532, 0.5805, 0.5877, 0.4151]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - + def test_stable_diffusion_inpaint_force_unmasked_unchanged_false(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From 6bd35cc187e9542eef1d05d08c9d8ec6a2fb8205 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Fri, 25 Aug 2023 01:10:05 -0700 Subject: [PATCH 05/10] start working on note in docs to force unmasked area to be unchanged --- docs/source/en/using-diffusers/inpaint.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 228e14e84833..51143110fa0a 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -74,3 +74,11 @@ Check out the Spaces below to try out image inpainting yourself! 
width="850" height="500" > + +## Preserving the Unmasked Area of the Image + +Generally speaking, [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) will change the unmasked part of the image as well to make the transitions to the repainted masked area more natural. If this behavior is undesirable, you can force the unmasked area to remain the same as follows: + +```python +# TODO: add example code +``` From dffc83ba3ae8edf7acf3fd3a18584cddc31789f1 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Wed, 30 Aug 2023 02:10:18 -0700 Subject: [PATCH 06/10] Add example of forcing the unmasked area to remain unchanged. --- docs/source/en/using-diffusers/inpaint.md | 42 +++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md index 51143110fa0a..c1873e8826f6 100644 --- a/docs/source/en/using-diffusers/inpaint.md +++ b/docs/source/en/using-diffusers/inpaint.md @@ -77,8 +77,46 @@ Check out the Spaces below to try out image inpainting yourself! ## Preserving the Unmasked Area of the Image -Generally speaking, [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) will change the unmasked part of the image as well to make the transitions to the repainted masked area more natural. If this behavior is undesirable, you can force the unmasked area to remain the same as follows: +Generally speaking, [`StableDiffusionInpaintPipeline`] (and other inpainting pipelines) will change the unmasked part of the image as well. 
If this behavior is undesirable, you can force the unmasked area to remain the same as follows: ```python -# TODO: add example code +import PIL +import numpy as np +import torch + +from diffusers import StableDiffusionInpaintPipeline +from diffusers.utils import load_image + +device = "cuda" +pipeline = StableDiffusionInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, +) +pipeline = pipeline.to(device) + +img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" +mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + +init_image = load_image(img_url).resize((512, 512)) +mask_image = load_image(mask_url).resize((512, 512)) + +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +repainted_image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] +repainted_image.save("repainted_image.png") + +# Convert mask to grayscale NumPy array +mask_image_arr = np.array(mask_image.convert("L")) +# Add a channel dimension to the end of the grayscale mask +mask_image_arr = mask_image_arr[:, :, None] +# Binarize the mask: 1s correspond to the pixels which are repainted +mask_image_arr = mask_image_arr.astype(np.float32) / 255.0 +mask_image_arr[mask_image_arr < 0.5] = 0 +mask_image_arr[mask_image_arr >= 0.5] = 1 + +# Take the masked pixels from the repainted image and the unmasked pixels from the initial image +unmasked_unchanged_image_arr = (1 - mask_image_arr) * np.array(init_image) + mask_image_arr * np.array(repainted_image) +unmasked_unchanged_image = PIL.Image.fromarray(unmasked_unchanged_image_arr.round().astype("uint8")) +unmasked_unchanged_image.save("force_unmasked_unchanged.png") ``` + +Forcing the unmasked portion of the image to remain the same might result in some weird transitions between the unmasked and masked 
areas, since the model will typically change the masked and unmasked areas to make the transition more natural. From 2f6dc317028bc5d7132bff99d58ead28c1d08a61 Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Wed, 30 Aug 2023 18:40:49 -0700 Subject: [PATCH 07/10] Revert "make style" This reverts commit fa7759293af916c665f683387a52fd789ed10531. --- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 7 ++++--- .../stable_diffusion/test_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 0c6fb0729e7c..2a1815cf6f8e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -585,7 +585,7 @@ def prepare_latents( "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." "However, either the image or the noise timestep has not been provided." ) - + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) if return_image_latents or (latents is None and not is_strength_max): @@ -768,8 +768,9 @@ def __call__( [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). force_unmasked_unchanged (`bool`, *optional*): Whether to force the unmasked areas of `image` to remain exactly the same after inpainting for a model - with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced to - remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an error. + with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced + to remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an + error. 
Examples: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index e2ab32e9cf45..616471b2c4fe 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -266,7 +266,7 @@ def test_stable_diffusion_inpaint_strength_zero_test(self): inputs["strength"] = 0.01 with self.assertRaises(ValueError): sd_pipe(**inputs).images - + def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() @@ -364,7 +364,7 @@ def test_stable_diffusion_inpaint(self): expected_slice = np.array([0.4925, 0.4967, 0.4100, 0.5234, 0.5322, 0.4532, 0.5805, 0.5877, 0.4151]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - + def test_stable_diffusion_inpaint_force_unmasked_unchanged_false(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From 99e14421e9fa6164c356dd4819e5cd1ed826d18e Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Wed, 30 Aug 2023 18:40:59 -0700 Subject: [PATCH 08/10] Revert "Use original mask to preserve unmasked pixels in pixel space rather than latent space." This reverts commit 092bd0e9e9628f02ff917aafb2a5c9252776b6e9. 
--- .../pipeline_stable_diffusion_inpaint.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 2a1815cf6f8e..38c2e79d7d32 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -982,18 +982,18 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if num_channels_unet == 4: - init_latents_proper = image_latents[:1] - init_mask = mask[:1] + if num_channels_unet == 4 and i < len(timesteps) - 1: + # add noise for next timestep + noise_timestep = timesteps[i + 1] - if i < len(timesteps) - 1: - # add noise for next timestep - noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - init_latents_proper, noise, torch.tensor([noise_timestep]) - ) + init_latents_proper = self.scheduler.add_noise( + image_latents[:1], noise, torch.tensor([noise_timestep]) + ) - latents = (1 - init_mask) * init_latents_proper + init_mask * latents + latents = (1 - mask[:1]) * init_latents_proper + mask[:1] * latents + + if force_unmasked_unchanged and i == (len(timesteps) - 1): + latents = (1 - mask[:1]) * image_latents[:1] + mask[:1] * latents # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): @@ -1010,11 +1010,6 @@ def __call__( mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) condition_kwargs = {"image": init_image_condition, "mask": mask_condition} image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, **condition_kwargs)[0] - # If force_unmasked_unchanged, use the original mask in pixel 
space to recover the original pixels - if force_unmasked_unchanged: - # Make sure image is on CPU - image = image.cpu() - image = (1 - mask_condition[:1]) * init_image[:1] + mask_condition[:1] * image image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: image = latents From e2f045fbebbb2e5314be4d5faac6922a6320609c Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Wed, 30 Aug 2023 18:41:02 -0700 Subject: [PATCH 09/10] Revert "Try to improve StableDiffusionInpaintPipelineFastTests.get_dummy_inputs." This reverts commit ff41cf43c5a820057289e0852fb41b3e2074d86d. --- .../test_stable_diffusion_inpaint.py | 55 ++++++++----------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 616471b2c4fe..75a0212474ba 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -144,31 +144,16 @@ def get_dummy_components(self): } return components - def get_dummy_inputs(self, device, seed=0, img_res=64, output_pil=True): + def get_dummy_inputs(self, device, seed=0): # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - if output_pil: - # Get random floats in [0, 1] as image - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - mask_image = torch.ones_like(image) - # Convert image and mask_image to [0, 255] - image = 255 * image - mask_image = 255 * mask_image - # Convert to PIL image - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((img_res, img_res)) - mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB").resize((img_res, img_res)) - else: - # Get random floats in [0, 1] as image with spatial size (img_res, img_res) - image = floats_tensor((1, 3, img_res, 
img_res), rng=random.Random(seed)).to(device) - # Convert image to [-1, 1] - init_image = 2.0 * image - 1.0 - mask_image = torch.ones((1, 1, img_res, img_res), device=device) - + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) - inputs = { "prompt": "A painting of a squirrel eating a burger", "image": init_image, @@ -274,24 +259,28 @@ def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs(device, output_pil=False) - init_image = inputs["image"].detach().clone() - mask_image = inputs["mask_image"].detach().clone() - # Map init_image to [0, 1] - init_image = (init_image / 2 + 0.5).clamp(0, 1) - # Get image slice - original_image_slice = init_image[0, -1, -3:, -3:].flatten().numpy() - # Unmask the same slice in mask_image - mask_image[0, 0, -3:, -3:] = 0 - + # Get 32 x 32 image manually + image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + original_image_slice = image[-3:, -3:, :].numpy() + init_image = Image.fromarray(np.uint8(255 * image)).convert("RGB") + + # mask_array = image + 4 + mask_array = image + 250 + # # Make some pixels unmasked + # mask_array[-3:, -3:, :] = 255 + mask_array[-3:, -3:, :] = 4 + mask_image = Image.fromarray(np.uint8(mask_array)).convert("RGB") + + inputs = self.get_dummy_inputs(device) + inputs["image"] = init_image inputs["mask_image"] = mask_image inputs["force_unmasked_unchanged"] = True - inputs["output_type"] = "pt" image = sd_pipe(**inputs).images - output_image_slice = image[0, -1, -3:, 
-3:].flatten().numpy() - assert np.abs(original_image_slice - output_image_slice).max() < 1e-3 + output_image_slice = image[0, -3:, -3:, :] + assert np.abs(original_image_slice.flatten() - output_image_slice.flatten()).max() < 1e-3 class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): From 3fd0aa383219b7b0b43209f42c2449b9c894656f Mon Sep 17 00:00:00 2001 From: Daniel Gu Date: Wed, 30 Aug 2023 18:41:04 -0700 Subject: [PATCH 10/10] Revert "Initial code to add force_unmasked_unchanged argument to StableDiffusionInpaintPipeline.__call__." This reverts commit 989979752a1c9e6e16e92d8f1b3d14b70829fbb5. --- .../pipeline_stable_diffusion_inpaint.py | 49 +++++-------------- .../test_stable_diffusion_inpaint.py | 44 ----------------- 2 files changed, 13 insertions(+), 80 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 38c2e79d7d32..ab2abdc05f51 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -585,14 +585,13 @@ def prepare_latents( "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." "However, either the image or the noise timestep has not been provided." ) - - noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) if return_image_latents or (latents is None and not is_strength_max): image = image.to(device=device, dtype=dtype) image_latents = self._encode_vae_image(image=image, generator=generator) if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) # if strength is 1. 
then initialise the latents to noise, else initial to image + noise latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma @@ -699,7 +698,6 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, - force_unmasked_unchanged: Optional[bool] = None, ): r""" The call function to the pipeline for generation. @@ -766,12 +764,6 @@ def __call__( cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - force_unmasked_unchanged (`bool`, *optional*): - Whether to force the unmasked areas of `image` to remain exactly the same after inpainting for a model - with 9 UNet channels. If the UNet has only 4 channels, then the unmasked areas will always be forced - to remain unchanged, and setting `force_unmasked_unchanged` to `False` in this case will raise an - error. - Examples: @@ -881,21 +873,7 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels num_channels_unet = self.unet.config.in_channels - - # return_image_latents = num_channels_unet == 4 or force_unmasked_unchanged - if num_channels_unet == 4 and force_unmasked_unchanged is False: - raise ValueError( - "Cannot set `force_unmasked_unchanged=False` for inpainting if the UNet has only 4 input channels." - " Either set `force_unmasked_unchanged` to `True` or use a UNet checkpoint that has been trained" - " specifically for inpainting." 
- ) - elif num_channels_unet == 4 and force_unmasked_unchanged is None: - # For checkpoints not trained for inpainting the unmasked area should by default not be changed - force_unmasked_unchanged = True - elif num_channels_unet > 4 and force_unmasked_unchanged is None: - # For checkpoints trained for inpainting the unmasked area should by default be allowed to change so that - # the model can make the transition between the inpainted and non-inpainted area more natural - force_unmasked_unchanged = False + return_image_latents = num_channels_unet == 4 latents_outputs = self.prepare_latents( batch_size * num_images_per_prompt, @@ -910,10 +888,10 @@ def __call__( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=force_unmasked_unchanged, + return_image_latents=return_image_latents, ) - if force_unmasked_unchanged: + if return_image_latents: latents, noise, image_latents = latents_outputs else: latents, noise = latents_outputs @@ -982,18 +960,17 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - if num_channels_unet == 4 and i < len(timesteps) - 1: - # add noise for next timestep - noise_timestep = timesteps[i + 1] + if num_channels_unet == 4: + init_latents_proper = image_latents[:1] + init_mask = mask[:1] - init_latents_proper = self.scheduler.add_noise( - image_latents[:1], noise, torch.tensor([noise_timestep]) - ) + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) - latents = (1 - mask[:1]) * init_latents_proper + mask[:1] * latents - - if force_unmasked_unchanged and i == (len(timesteps) - 1): - latents = (1 - mask[:1]) * image_latents[:1] + mask[:1] * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided if i == 
len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 75a0212474ba..16fff0c13dd2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -251,36 +251,6 @@ def test_stable_diffusion_inpaint_strength_zero_test(self): inputs["strength"] = 0.01 with self.assertRaises(ValueError): sd_pipe(**inputs).images - - def test_stable_diffusion_inpaint_force_unmasked_unchanged(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - # Get 32 x 32 image manually - image = floats_tensor((1, 3, 32, 32), rng=random.Random(0)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - original_image_slice = image[-3:, -3:, :].numpy() - init_image = Image.fromarray(np.uint8(255 * image)).convert("RGB") - - # mask_array = image + 4 - mask_array = image + 250 - # # Make some pixels unmasked - # mask_array[-3:, -3:, :] = 255 - mask_array[-3:, -3:, :] = 4 - mask_image = Image.fromarray(np.uint8(mask_array)).convert("RGB") - - inputs = self.get_dummy_inputs(device) - inputs["image"] = init_image - inputs["mask_image"] = mask_image - inputs["force_unmasked_unchanged"] = True - - image = sd_pipe(**inputs).images - - output_image_slice = image[0, -3:, -3:, :] - assert np.abs(original_image_slice.flatten() - output_image_slice.flatten()).max() < 1e-3 class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): @@ -353,20 +323,6 @@ def test_stable_diffusion_inpaint(self): expected_slice = np.array([0.4925, 0.4967, 0.4100, 0.5234, 0.5322, 0.4532, 0.5805, 0.5877, 0.4151]) 
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_force_unmasked_unchanged_false(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - - # check that the pipeline raises value error when num_unet_channels == 4 and force_masked_unchanged == False - inputs["force_unmasked_unchanged"] = False - with self.assertRaises(ValueError): - sd_pipe(**inputs).images @unittest.skip("skipped here because area stays unchanged due to mask") def test_stable_diffusion_inpaint_lora(self):